diff --git a/.gitignore b/.gitignore index 62fed4090ef71208825df7eaae16c8e88df30355..cecd6fa91c754d0862d26a10833a83aa3ced819c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ build/ build_doc/ *.user *.tmp +*.pyc .vscode .idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index df2e59b7647bf0231362a4220e8610f50243f1c5..2684529930e7ce2b1dba0bbfb3fb95968e0eadc7 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: description: Format files with ClangFormat. entry: bash ./tools/codestyle/clang_format.hook -i language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ - repo: local hooks: - id: cpplint-cpp-source @@ -48,7 +48,7 @@ repos: name: copyright_checker entry: python ./tools/codestyle/copyright.hook language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ exclude: | (?x)^( paddle/utils/.* diff --git a/CMakeLists.txt b/CMakeLists.txt index a4c1b9c8098e9e632a4a05c491e07b1ce051c945..5b499fb43ab996b1c1780c0276faad2c37a8808a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,6 +330,7 @@ if(WITH_BRPC_RDMA) endif() endif() + if(WITH_GPU) include(cuda) # lite subgraph compilation depends on CUDNN_ROOT, diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 27210e5260048a57cc442fce4c6cf8657e401568..a7a9e85ffd7314ac7026fccdf45fae2fa3de09d3 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,7 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) endfunction() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 5e60f1f2b99fee38fefc9b584f0d1d75b7c05e5b..415c0fe9bef9eab89e670d8b3f6f7c330b316ed8 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 847073fb7b57c255e82bd3f229e420a68b0af079..f7c17bd7cfe7e099e0afeaf623724e12387aff44 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -116,19 +116,19 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) -set_property(GLOBAL PROPERTY PTEN_MODULES "") -# find all pten modules is used for paddle static library +set_property(GLOBAL PROPERTY PHI_MODULES "") +# find all phi modules is used for paddle static library # for building inference libs -function(find_pten_modules TARGET_NAME) +function(find_phi_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) string(FIND "${__target_path}" "phi" pos) if(pos GREATER 1) - get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) - set(pten_modules ${pten_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY PTEN_MODULES "${pten_modules}") + get_property(phi_modules GLOBAL PROPERTY 
PHI_MODULES) + set(phi_modules ${phi_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}") endif() -endfunction(find_pten_modules) +endfunction(find_phi_modules) function(common_link TARGET_NAME) if (WITH_PROFILER) @@ -324,7 +324,7 @@ function(cc_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if(cc_library_DEPS) # Don't need link libwarpctc.so @@ -497,7 +497,7 @@ function(nv_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) @@ -588,7 +588,7 @@ function(hip_library TARGET_NAME) else() hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (hip_library_DEPS) add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index b8d1f4eb116a96a5d3df92d56fade77ecd529b45..c48d31f7e4f90296ecc48acb56e619aae129106e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -224,7 +224,7 @@ copy(inference_lib_dist DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) -# copy api headers for pten & custom op +# copy api headers for phi & custom op copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/) @@ -244,11 +244,11 @@ copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) -# the header file of pten is copied to the experimental directory, -# the include path of pten needs to be changed to adapt to inference api path +# the header file of phi is copied to the experimental directory, +# the include path of phi needs to be changed to adapt to inference api path add_custom_command(TARGET inference_lib_dist POST_BUILD - COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake" - COMMENT "Change pten header include path to adapt to inference api path") + COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake" + COMMENT "Change phi header include path to adapt to inference api path") # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 8469dc4c02ee37b333254d6d35b0eb48354d4b86..7affd59de162d5956672e5abfbf9f4b287fb7a83 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -73,6 +73,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + # rename in KP: .kps -> .cu + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() if (WITH_NV_JETSON) list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") endif() @@ -96,6 +102,12 @@ function(op_library TARGET) if (EXISTS 
${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND hip_srcs ${TARGET}.cu) endif() + # rename in KP: .kps -> .cu + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) @@ -125,6 +137,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + list(APPEND xpu_kp_cc_srcs ${TARGET}.kps) + endif() endif() if(WITH_ASCEND_CL) string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") @@ -162,6 +177,8 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${src}) elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") list(APPEND xpu_kp_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$") + list(APPEND xpu_kp_cc_srcs ${src}) elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") list(APPEND npu_cc_srcs ${src}) elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") @@ -384,7 +401,15 @@ function(op_library TARGET) # pybind USE_OP_DEVICE_KERNEL for XPU KP if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n") + foreach(xpu_kp_src ${xpu_kp_cc_srcs}) + set(op_name "") + find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n") + message(STATUS "Building KP Target: ${op_name}") + set(pybind_flag 1) + endif() + endforeach() endif() # pybind USE_OP_DEVICE_KERNEL for NPU diff --git a/cmake/pten.cmake b/cmake/phi.cmake similarity index 77% rename from cmake/pten.cmake rename to cmake/phi.cmake index 6049f6e21e5662a8b45e6f77898f10c2220a70b5..d9132b84455e7309713b99f9e574bfceb83c7b6c 100644 --- a/cmake/pten.cmake +++ b/cmake/phi.cmake @@ -51,33 +51,41 @@ function(generate_unify_header DIR_NAME) endforeach() # append header into extension.h string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}") - file(APPEND ${pten_extension_header_file} "#include \"${header_file}\"\n") + file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n") endfunction() # call kernel_declare need to make sure whether the target of input exists function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL - # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") + # some gpu kernel only can run on cuda, not support rocm, so we add this branch + if (WITH_ROCM) + string(FIND "${first_registry}" "cuda_only" pos) + if(pos GREATER 1) + continue() + endif() + endif() # parse the first kernel name - string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") - string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") + 
string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") + string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}") # append kernel declare into declarations.h # TODO(chenweihang): default declare ALL_LAYOUT for each kernel if (${kernel_path} MATCHES "./cpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./xpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./gpudnn\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") endif() endif() endforeach() @@ -88,6 +96,7 @@ function(kernel_library TARGET) set(cpu_srcs) set(gpu_srcs) set(xpu_srcs) + set(gpudnn_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -95,6 +104,8 @@ function(kernel_library TARGET) set(oneValueArgs SUB_DIR) set(multiValueArgs SRCS DEPS) + set(target_build_flag 1) + cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -117,6 +128,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + endif() endif() if (WITH_XPU) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) @@ -135,6 +149,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${cpu_srcs}) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) + list(APPEND all_srcs ${gpudnn_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -160,21 +175,22 @@ function(kernel_library TARGET) list(LENGTH cpu_srcs cpu_srcs_len) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) + list(LENGTH gpudnn_srcs gpudnn_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR - ${selected_rows_srcs_len} GREATER 0)) + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. 
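For context on the `kernel_declare()` change earlier in this hunk, below is a hedged sketch (illustrative only, not part of this patch) of the registration line the new regex matches and the declaration it emits. The kernel name `my_add` and the functor `phi::MyAddKernel` are hypothetical; only the macro shapes visible in the CMake code above are assumed. The `kernel_library` build rules continue right after this note.

```cpp
// Hypothetical phi kernel source under paddle/phi/kernels/gpu/, e.g.
// my_add_kernel.cu. kernel_declare() reads the file, matches the first
// PD_REGISTER_KERNEL(...) occurrence and extracts the kernel name.
PD_REGISTER_KERNEL(my_add,            // <- name captured by the CMake regex
                   GPU,               // backend
                   ALL_LAYOUT,        // layout
                   phi::MyAddKernel,  // hypothetical kernel functor
                   float,
                   double) {}

// Because the source path matches "./gpu/", kernel_declare() appends this
// line to the generated declarations.h:
//   PD_DECLARE_KERNEL(my_add, GPU, ALL_LAYOUT);
//
// Under WITH_ROCM, a registration annotated with a "cuda_only" comment right
// after the kernel name is skipped, so no declaration is emitted for it.
```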
if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() else() @@ -184,14 +200,14 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. - elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) @@ -228,35 +244,40 @@ function(kernel_library TARGET) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() else() - message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") + set(target_build_flag 0) endif() - if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR - ${selected_rows_srcs_len} GREATER 0) - # append target into PTEN_KERNELS property - get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) - set(pten_kernels ${pten_kernels} ${TARGET}) - set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels}) - endif() + if (${target_build_flag} EQUAL 1) + if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) + # append target into PHI_KERNELS property + get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) + set(phi_kernels ${phi_kernels} ${TARGET}) + set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) + endif() - # parse kernel name and auto generate kernel declaration - # here, we don't need to check WITH_XXX, because if not WITH_XXX, the - # xxx_srcs_len will be equal to 0 - if (${common_srcs_len} GREATER 0) - kernel_declare(${common_srcs}) - endif() - if (${cpu_srcs_len} 
GREATER 0) - kernel_declare(${cpu_srcs}) - endif() - if (${gpu_srcs_len} GREATER 0) - kernel_declare(${gpu_srcs}) - endif() - if (${xpu_srcs_len} GREATER 0) - kernel_declare(${xpu_srcs}) - endif() - if (${selected_rows_srcs_len} GREATER 0) - kernel_declare(${selected_rows_srcs}) + # parse kernel name and auto generate kernel declaration + # here, we don't need to check WITH_XXX, because if not WITH_XXX, the + # xxx_srcs_len will be equal to 0 + if (${common_srcs_len} GREATER 0) + kernel_declare(${common_srcs}) + endif() + if (${cpu_srcs_len} GREATER 0) + kernel_declare(${cpu_srcs}) + endif() + if (${gpu_srcs_len} GREATER 0) + kernel_declare(${gpu_srcs}) + endif() + if (${xpu_srcs_len} GREATER 0) + kernel_declare(${xpu_srcs}) + endif() + if (${gpudnn_srcs_len} GREATER 0) + kernel_declare(${gpudnn_srcs}) + endif() + if (${selected_rows_srcs_len} GREATER 0) + kernel_declare(${selected_rows_srcs}) + endif() endif() endfunction() @@ -285,9 +306,9 @@ endfunction() function(append_op_util_declare TARGET) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) - string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") - string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") - string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") + string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") + string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") string(APPEND util_declare ");\n") file(APPEND ${op_utils_header} "${util_declare}") endfunction() diff --git a/cmake/pten_header.cmake b/cmake/phi_header.cmake similarity index 68% rename from cmake/pten_header.cmake rename to cmake/phi_header.cmake index 6341aca9ec739449448726913aac7dcb349d5ea0..c9b7e465337dd1cf9ca35f5a595221600ab33ca7 100644 --- a/cmake/pten_header.cmake +++ b/cmake/phi_header.cmake @@ -14,8 +14,8 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir") -function(pten_header_path_compat TARGET_PATH) -message(STATUS "pten header path compat processing: ${TARGET_PATH}") +function(phi_header_path_compat TARGET_PATH) +message(STATUS "phi header path compat processing: ${TARGET_PATH}") string(FIND ${TARGET_PATH} "experimental" pos) if (pos GREATER 1) file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") @@ -25,17 +25,17 @@ if (pos GREATER 1) string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" HEADER_CONTENT "${HEADER_CONTENT}") string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" HEADER_CONTENT "${HEADER_CONTENT}") file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "pten header path compat processing complete: ${header}") + message(STATUS "phi header path compat processing complete: ${header}") endif() endforeach() endif() endfunction() -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) 
-pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) # In order to be compatible with the original behavior, the header file name needs to be changed file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index f8ab9693db0c9ebe845ae7d77a562fd005f5130d..adab3e1423c91522092dac5503d8c58dcc8370db 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -17,7 +17,7 @@ if(NOT WITH_XPU_KP) endif() if(NOT XPU_TOOLCHAIN) - set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK) + set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64) get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) endif() if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN}) @@ -102,7 +102,7 @@ macro(compile_kernel COMPILE_ARGS) set(XTDK_DIR ${XPU_TOOLCHAIN}) set(CXX_DIR ${HOST_SYSROOT}) - set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) + set(XPU_CXX_FLAGS -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) #include path get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -127,9 +127,11 @@ macro(compile_kernel COMPILE_ARGS) kernel_build/${kernel_name}.bin.o COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu + -I. 
-o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu --xpu-device-only -c -v COMMAND ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} @@ -148,9 +150,11 @@ macro(compile_kernel COMPILE_ARGS) kernel_build/${kernel_name}.host.o COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu + -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu --xpu-host-only -c -v WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} @@ -185,7 +189,7 @@ macro(xpu_add_library TARGET_NAME) # Distinguish .xpu file from other files foreach(cur_xpu_src IN LISTS xpu_srcs_lists) get_filename_component(language_type_name ${cur_xpu_src} EXT) - if(${language_type_name} STREQUAL ".xpu") + if(${language_type_name} STREQUAL ".kps") list(APPEND xpu_kernel_lists ${cur_xpu_src}) else() list(APPEND cc_kernel_lists ${cur_xpu_src}) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5ae2e26e87c7b33a75325f5b585ca115bd3b6308..06b0583eddf24e344b4494f17472ad4bc9c18881 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(collective) +add_subdirectory(store) if(NOT WITH_PSCORE) add_subdirectory(fleet_executor) return() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..41652f8b6ed6f717ad8a571be8e7a16408b34504 --- /dev/null +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) + +if(WITH_NCCL) + cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h new file mode 100644 index 0000000000000000000000000000000000000000..f30b96e72d4536b0773c9b69b6cb90b2c8c2dc87 --- /dev/null +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -0,0 +1,198 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + platform::dynload::ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// NOTE(shenliang03): EventManager are movable not copyable CudaEvent wrapper. +// EventManage is different from paddle::platform::CudaEvent. +// It uses lazy initialization and is only created when the +// Record() method is called for the first time; it also monitors +// device information to ensure that recorded stream and event +// are on the same device. + +class EventManager { + public: + EventManager() {} + explicit EventManager(unsigned int flags) : flags_{flags} {} + + ~EventManager() { + if (is_created_) { + platform::CUDADeviceGuard guard(device_index_); + cudaEventDestroy(event_); + } + } + + EventManager(const EventManager&) = delete; + EventManager& operator=(const EventManager&) = delete; + + EventManager(EventManager&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + EventManager& operator=(EventManager&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + gpuEvent_t GetRawCudaEvent() const { return event_; } + + void Record(const paddle::platform::CUDADeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::CUDADeviceGuard guard(device_index_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream())); + } + + bool Query() const { + gpuError_t err = cudaEventQuery(event_); + if (err == cudaSuccess) { + return true; + } else if (err == cudaErrorNotReady) { + return false; + } else { + PADDLE_ENFORCE_GPU_SUCCESS(err); + return false; + } + } + + void Synchronize() const { + if (is_created_) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); + } + } + + void Block(const paddle::platform::CUDADeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::CUDADeviceGuard guard(device_index_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0)); + } + } + + private: + unsigned int flags_ = cudaEventDefault; + bool is_created_{false}; + gpuEvent_t event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::CUDADeviceGuard 
guard(device_index); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_)); + is_created_ = true; + } +}; + +// NOTE(shenliang03): NCCLCommManager is more lightweight than +// platform::NCCLComm + +class NCCLCommManager { + public: + explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {} + + NCCLCommManager() : NCCLCommManager(nullptr) {} + + ~NCCLCommManager() noexcept { + std::unique_lock lock(mutex_); + if (nccl_comm_) { + platform::dynload::ncclCommDestroy(nccl_comm_); + } + } + + static std::shared_ptr Create(int num_ranks, int rank, + ncclUniqueId comm_id) { + auto nccl_manager = std::make_shared(); + NCCLCHECK(platform::dynload::ncclCommInitRank(&(nccl_manager->nccl_comm_), + num_ranks, comm_id, rank)); + + nccl_manager->nccl_id_ = comm_id; + nccl_manager->rank_ = rank; + return nccl_manager; + } + + ncclUniqueId GetNcclId() const { + std::unique_lock lock(mutex_); + return nccl_id_; + } + + ncclComm_t GetNcclComm() const { + std::unique_lock lock(mutex_); + return nccl_comm_; + } + + NCCLCommManager(const NCCLCommManager&) = delete; + NCCLCommManager& operator=(const NCCLCommManager&) = delete; + NCCLCommManager& operator=(NCCLCommManager&& other) = delete; + + NCCLCommManager(NCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(nccl_comm_, other.nccl_comm_); + } + + protected: + ncclComm_t nccl_comm_; + ncclUniqueId nccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc new file mode 100644 index 0000000000000000000000000000000000000000..42ca3bd5f5be49e72662d563ba6e20f3097840ef --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +namespace paddle { +namespace distributed { + +ProcessGroup::Task::Task(int rank, const std::vector& inputTensors, + CommType comm_type) + : rank_(rank), comm_type_(comm_type) {} + +ProcessGroup::Task::~Task() = default; + +bool ProcessGroup::Task::IsCompleted() { + std::lock_guard lock(mutex_); + return is_completed_; +} + +bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { + return false; +} + +void ProcessGroup::Task::Synchronize() {} + +ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) {} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h new file mode 100644 index 0000000000000000000000000000000000000000..dde8622d9007e1372739d0fedde4938f85eda323 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
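A hedged usage sketch for the EventManager wrapper defined above (illustrative only, not part of this patch). It shows the Record/Block pairing used to order work between two CUDA device contexts; both contexts are assumed to live on the same GPU, as required by the device checks inside Record() and Block().

```cpp
#include "paddle/fluid/distributed/collective/NCCLTools.h"

// Make `consumer`'s stream wait until everything currently queued on
// `producer`'s stream has finished, without blocking the host.
void OrderStreams(const paddle::platform::CUDADeviceContext& producer,
                  const paddle::platform::CUDADeviceContext& consumer) {
  paddle::distributed::EventManager event;  // cudaEvent_t is created lazily
  event.Record(producer);                   // first Record() creates the event
  event.Block(consumer);                    // cudaStreamWaitEvent on consumer's stream
  // event.Query() / event.Synchronize() remain available for host-side checks.
}
```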
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/enforce.h" + +constexpr auto kWaitTimeout = std::chrono::milliseconds(0); + +namespace paddle { +namespace distributed { + +using Tensor = paddle::experimental::Tensor; + +enum class CommType : std::uint8_t { + BROADCAST = 0, + ALLREDUCE = 1, + ALLREDUCE_SPARSE = 2, // TODO(shenliang03): to support sparse in allreduce + REDUCE = 3, + ALLGATHER = 4, + GATHER = 5, + SCATTER = 6, + REDUCE_SCATTER = 7, + ALLTOALL = 8, + SEND = 9, + RECV = 10, + BARRIER = 11, + UNKNOWN = 100, +}; + +struct ProcessGroupStrategy { + int nranks_{1}; + int local_rank_{0}; + std::vector trainer_endpoints_{}; + std::string current_endpoint_{""}; + int nrings_{1}; +}; + +class ProcessGroup { + public: + class Task { + public: + Task(int rank, const std::vector& inputTensors, + CommType opType = CommType::UNKNOWN); + + virtual ~Task(); + virtual bool IsCompleted(); + virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + virtual void Synchronize(); + + protected: + const int rank_; + CommType comm_type_; + std::mutex mutex_; + bool is_completed_ = false; + }; + + explicit ProcessGroup(int rank, int size); + virtual ~ProcessGroup() {} + + int GetRank() const { return rank_; } + + int GetSize() const { return size_; } + + virtual const std::string GetBackendName() const = 0; + + virtual std::shared_ptr AllReduce( + std::vector& /* tensors */, + const AllreduceOptions& = AllreduceOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support allreduce", GetBackendName())); + } + + virtual std::shared_ptr Broadcast( + std::vector& /* tensors */, + const BroadcastOptions& = BroadcastOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support allreduce", GetBackendName())); + } + + protected: + const int rank_; + const int size_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe2325423b460d7b42e08b03cf9b083bc94fc7b6 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -0,0 +1,321 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
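To make the role of the abstract ProcessGroup interface above concrete, here is a hedged caller-side sketch (illustrative only, not part of this patch). It assumes the caller already holds the tensors and a constructed backend instance such as ProcessGroupNCCL; the option types come from Types.h added in this PR.

```cpp
#include <memory>
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h"

// Sum-allreduce `tensors` in place through whatever backend `pg` wraps and
// block until the collective has been made visible to the default streams.
void AllReduceSum(paddle::distributed::ProcessGroup* pg,
                  std::vector<paddle::experimental::Tensor>& tensors) {
  paddle::distributed::AllreduceOptions opts;
  opts.reduce_op = paddle::distributed::ReduceOp::SUM;
  std::shared_ptr<paddle::distributed::ProcessGroup::Task> task =
      pg->AllReduce(tensors, opts);
  task->Wait();  // default kWaitTimeout; the NCCL backend syncs its streams here
}
```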
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" + +DECLARE_bool(nccl_blocking_wait); +DECLARE_bool(use_stream_safe_cuda_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ(it != red_type.end(), true, + platform::errors::InvalidArgument( + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); + return it->second; +} + +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { + const uint8_t* bytes = reinterpret_cast(&ncclID); + std::ostringstream oss; + for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +bool CheckTensorsInCudaPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kGPU; + }); +} + +void SyncDefaultStream( + const std::vector& places, + std::vector& ncclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + ncclEvents[i].Record(*dev_ctx[i]); + ncclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupNCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + ncclComms_.resize(places.size()); +} + +ProcessGroupNCCL::NCCLTask::~NCCLTask() {} + +void ProcessGroupNCCL::NCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent()); + } +} + +bool ProcessGroupNCCL::NCCLTask::IsCompleted() { + for (size_t 
i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sheniang03): Add timeout for wait, now timeout unused +bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + if (FLAGS_nccl_blocking_wait) { + // NOTE(shenliang03): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + } + return true; +} + +// Same as Wait +void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, + int rank, int size) + : ProcessGroup(rank, size), strategy_(strategy) {} + +void ProcessGroupNCCL::BcastNCCLId( + std::vector& nccl_ids, // NOLINT + int root, int server_fd) { + if (strategy_.local_rank_ == root) { + std::vector other_trainers; + for (auto& ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) { + other_trainers.push_back(ep); + } + } + platform::SendBroadCastCommID(other_trainers, &nccl_ids); + } else { + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); + } +} + +void ProcessGroupNCCL::BroadcastUniqueNCCLID( + std::vector& nccl_ids) { // NOLINT + + int server_fd = -1; + if (rank_ != 0) { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastNCCLId(nccl_ids, 0, server_fd); +} + +// create NCCLManager cache for places_key +void ProcessGroupNCCL::CreateNCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the NCCL Communicator since " + "the GPU place are not known")); + + std::vector> nccl_comms; + nccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector nccl_ids; + nccl_ids.resize(1); + auto& nccl_id = nccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + } + BroadcastUniqueNCCLID(nccl_ids); + + VLOG(3) << "init nccl rank: " << strategy_.local_rank_ + << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + + for (size_t i = 0; i < places.size(); ++i) { + platform::CUDADeviceGuard guard(places[i]); + nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id); + dev_ctx[i].reset(new CUDADeviceContext(places[i])); + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupNCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + 
+ auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(inputs[i].impl()); + memory::RecordStream(dense_tensor->Holder(), + places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream); + } + } + + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupNCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclBcast( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h new file mode 100644 index 0000000000000000000000000000000000000000..9f06566d1c86386acad3758be283e716f46c1951 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
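A hedged sketch of how the ProcessGroupNCCL implementation above gets bootstrapped (illustrative only, not part of this patch). The endpoints and world size are placeholders; in practice the strategy is filled in by the distributed launcher, and the ncclUniqueId exchange happens lazily inside CreateNCCLManagerCache() on the first collective.

```cpp
#include <memory>

#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

// rank: this trainer's index in [0, world_size)
std::shared_ptr<paddle::distributed::ProcessGroupNCCL> MakeNcclGroup(int rank) {
  const int world_size = 2;  // placeholder world size
  paddle::distributed::ProcessGroupStrategy strategy;
  strategy.nranks_ = world_size;
  strategy.local_rank_ = rank;
  strategy.trainer_endpoints_ = {"127.0.0.1:6170", "127.0.0.1:6171"};  // placeholders
  strategy.current_endpoint_ = strategy.trainer_endpoints_[rank];

  auto pg = std::make_shared<paddle::distributed::ProcessGroupNCCL>(
      strategy, rank, world_size);
  // The first AllReduce/Broadcast on a new set of places triggers
  // CreateNCCLManagerCache(): rank 0 generates the ncclUniqueId and
  // broadcasts it to the other endpoints before ncclCommInitRank runs.
  return pg;
}
```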
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +constexpr const char* NCCL_BACKEND_NAME = "NCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using CUDAStream = platform::stream::CUDAStream; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +class ProcessGroupNCCL : public ProcessGroup { + public: + class NCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + NCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~NCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> ncclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(NCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + protected: + ProcessGroupStrategy strategy_; + std::shared_ptr nccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_ncclcomm_; + + std::unordered_map> places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + private: + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueNCCLID(std::vector& nccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + void CreateNCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h new file mode 100644 index 0000000000000000000000000000000000000000..654d06686957bd4242fa474c215ccf7c117e5910 --- /dev/null +++ b/paddle/fluid/distributed/collective/Types.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace paddle { +namespace distributed { + +// TODO(shenliang03): To support AVG for reduce +enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT }; + +struct AllreduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; +}; + +struct BroadcastOptions { + int source_rank = 0; + int source_root = 0; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e684d75bfb8320df06813bbe4e61fcd7d0c9d934..c1408130b5e577e54a4062316a4868701338864d 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -52,6 +52,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_tensor_ptr = input_tensor->mutable_data(dims, place); } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::FLOAT16) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; @@ -412,6 +414,8 @@ bool DistModel::PrepareFeedAndFetch() { feeds_to_dtype_.insert({var_name, DistModelDataType::INT32}); } else if (real_var->GetDataType() == framework::proto::VarType::INT64) { feeds_to_dtype_.insert({var_name, DistModelDataType::INT64}); + } else if (real_var->GetDataType() == framework::proto::VarType::FP16) { + feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT16}); } else { LOG(ERROR) << "Don't support feed var dtype for: " << real_var->GetDataType(); @@ -503,9 +507,13 @@ bool DistModel::FetchResults(std::vector *output_data, } else if (type == framework::proto::VarType::INT32) { rst = FetchResult(fetch, output); output->dtype = DistModelDataType::INT32; + } else if (type == framework::proto::VarType::FP16) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::FLOAT16; } else { LOG(ERROR) << "DistModel meets unknown fetch data type. 
DistModel only " - "supports float32, int64 and int32 fetch type for now."; + "supports float32, float16, int64 and int32 fetch type " + "for now."; } if (!rst) { LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx]; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 6bdd858d6cf9ed78c1a655c28ed58574374ce3fb..dc8b2596803e074a7ca8cea069bf7d93ef1615e7 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -40,6 +41,11 @@ constexpr DistModelDataType DistModelGetDtype() { return DistModelDataType::FLOAT32; } +template <> +constexpr DistModelDataType DistModelGetDtype() { + return DistModelDataType::FLOAT16; +} + class DistModelDataBuf { public: explicit DistModelDataBuf(size_t length) diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 9f2a8eb24533d12ca289543ee7f75d2c05f9b2a3..2009ec772e1cf66d3997e3f4be8f2e67bf2c32e3 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -238,7 +238,7 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, void* tensor_data = tensor->mutable_data( place, - framework::TransToPtenDataType(VarMessageToVarType(msg.data_type()))); + framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); // IO Buffer if (platform::is_cpu_place(place)) { @@ -281,7 +281,7 @@ void DeserializeSelectedRows( tensor->Resize(phi::make_ddim(vec_dim)); void* tensor_data = tensor->mutable_data( place, - framework::TransToPtenDataType(VarMessageToVarType(msg.data_type()))); + framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); // IO Buffer if (platform::is_cpu_place(place)) { unsigned long data_len; // NOLINT diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator_common.h b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h index 66784c53c0026afa988119a506ef065181b0cb4d..27b282a945d1521c0a863bb0bb176c9492296b07 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator_common.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h @@ -31,7 +31,8 @@ struct CommContext { const std::vector &origin_names, int id, bool merge_add_ = true, bool is_sparse_ = true, bool is_distributed_ = false, int table_id_ = -1, - bool is_tensor_table_ = false) + bool is_tensor_table_ = false, bool is_datanorm_table_ = false, + int64_t program_id_ = -1) : var_name(name), splited_varnames(names), epmap(emap), @@ -42,7 +43,9 @@ struct CommContext { is_sparse(is_sparse_), is_distributed(is_distributed_), table_id(table_id_), - is_tensor_table(is_tensor_table_) {} + program_id(program_id_), + is_tensor_table(is_tensor_table_), + is_datanorm_table(is_datanorm_table_) {} CommContext(const CommContext &ctx) { var_name = ctx.var_name; @@ -55,7 +58,9 @@ struct CommContext { origin_varnames = ctx.origin_varnames; is_distributed = ctx.is_distributed; table_id = ctx.table_id; + program_id = ctx.program_id; is_tensor_table = ctx.is_tensor_table; + is_datanorm_table = ctx.is_datanorm_table; } std::string print() const { @@ -78,7 +83,9 @@ struct CommContext { ss << " is_sparse: " << is_sparse; ss 
<< " is_distributed: " << is_distributed << "\n"; ss << " table_id: " << table_id << "\n"; + ss << " program_id: " << program_id << "\n"; ss << " is_tensor_table: " << is_tensor_table << "\n"; + ss << " is_datanorm_table: " << is_datanorm_table << "\n"; return ss.str(); } @@ -93,7 +100,9 @@ struct CommContext { bool is_sparse; bool is_distributed; int table_id; + int64_t program_id; bool is_tensor_table; + bool is_datanorm_table; }; } // namespace distributed diff --git a/paddle/fluid/distributed/store/CMakeLists.txt b/paddle/fluid/distributed/store/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fde447d97dd99783a77a9a2ad89b4457b55ca74 --- /dev/null +++ b/paddle/fluid/distributed/store/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(tcp_store SRCS tcp_store.cc tcp_utils.cc DEPS enforce glog) diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h new file mode 100644 index 0000000000000000000000000000000000000000..2673314d222d2b32e42c42a3a94df71a1887914a --- /dev/null +++ b/paddle/fluid/distributed/store/store.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_utils.h" + +namespace paddle { +namespace distributed { + +class Store { + public: + Store() = delete; + explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} + virtual ~Store() = default; + + virtual int64_t add(const std::string& key, int64_t value) = 0; + virtual std::vector get(const std::string& key) = 0; + virtual void wait(const std::string& key) = 0; + + virtual const std::chrono::seconds& timeout() const { return _timeout; } + + private: + std::chrono::seconds _timeout; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc new file mode 100644 index 0000000000000000000000000000000000000000..de85ac0d910e93257a308052ca1fcf193680a183 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_store.h" +#include "paddle/fluid/distributed/store/tcp_utils.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +namespace detail { + +constexpr int INFTIME = -1; + +std::unique_ptr MasterDaemon::start(SocketType socket) { + return std::make_unique(socket); +} + +MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { + _background_thread = std::thread{&MasterDaemon::run, this}; +} + +MasterDaemon::~MasterDaemon() { + _background_thread.join(); + tcputils::close_socket(_listen_socket); + for (SocketType socket : _sockets) { + tcputils::close_socket(socket); + } +} + +void MasterDaemon::_do_add(SocketType socket) { + int64_t new_value{}; + std::string key = tcputils::receive_string(socket); + new_value = tcputils::receive_value(socket); + std::vector old_value; + auto it = _store.find(key); + if (it != _store.end()) { + old_value = it->second; + char* buffer = reinterpret_cast(it->second.data()); + size_t len = old_value.size(); + new_value += std::stoll(std::string(buffer, len)); + } + + std::string new_value_str = std::to_string(new_value); + _store[key] = + std::vector(new_value_str.begin(), new_value_str.end()); + VLOG(3) << "TCPStore: new value (" << new_value << ") for key (" << key + << ")."; + tcputils::send_value(socket, new_value); +} + +void MasterDaemon::_do_get(SocketType socket) { + std::string key = tcputils::receive_string(socket); + auto iter = _store.find(key); + PADDLE_ENFORCE_NE( + iter, _store.end(), + platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); + std::vector value = iter->second; + VLOG(3) << "TCPStore: value (" + << std::stoll(std::string(reinterpret_cast(value.data()), + value.size())) + << ") for key (" << key << ")."; + tcputils::send_vector(socket, value); +} + +void MasterDaemon::_do_stop(SocketType socket) { + ReplyType value = ReplyType::STOP_WAIT; + _stop = true; + tcputils::send_value(socket, value); +} + +void MasterDaemon::_do_wait(SocketType socket) { + std::string key = tcputils::receive_string(socket); + auto iter = _store.find(key); + auto reply = ReplyType::STOP_WAIT; + if (iter == _store.end()) { + reply = ReplyType::WAITING; + } + VLOG(3) << "TCPStore: wait reply (" << static_cast(reply) + << ") for key (" << key << ")."; + tcputils::send_value(socket, reply); +} + +void MasterDaemon::run() { + std::vector fds; +#ifdef _WIN32 + fds.push_back({_listen_socket, POLLIN}); +#else + fds.push_back({.fd = _listen_socket, .events = POLLIN, .revents = 0}); +#endif + + while (!_stop) { + for (size_t i = 0; i < fds.size(); i++) { + fds[i].revents = 0; + } + +#ifdef _WIN32 + ::WSAPoll(fds.data(), fds.size(), INFTIME); +#else + ::poll(fds.data(), fds.size(), INFTIME); +#endif + + if (fds[0].revents != 0) { + auto socket = tcputils::tcp_accept(_listen_socket); + _sockets.emplace_back(socket); +#ifdef _WIN32 + fds.push_back({socket, POLLIN}); +#else + fds.push_back({.fd = socket, .events = POLLIN, .revents = 0}); +#endif + } + + for (size_t i = 1; i < fds.size(); i++) { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + } + } + } +} + 
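
Editor's note: MasterDaemon::run() above is a single-threaded poll loop that accepts new client sockets and dispatches their ADD/GET/WAIT/STOP requests; the TCPServer/TCPClient helpers and the TCPStore methods that follow wrap this protocol. Below is a hedged usage sketch, not part of the diff: the host, port, rank handling and the uint8_t element type returned by get() are assumptions. Every participating process would run this function.

// Illustrative use of the TCPStore introduced by this patch.
#include <cstdint>
#include <iostream>
#include <string>
#include "paddle/fluid/distributed/store/tcp_store.h"

void Rendezvous(int rank, int world_size) {
  using paddle::distributed::TCPStore;

  // Rank 0 hosts the master daemon; all ranks (including rank 0) connect as
  // clients. On the master, the constructor additionally blocks until
  // world_size workers have checked in (see waitWorkers() below).
  TCPStore store("127.0.0.1", /*port=*/6170, /*is_master=*/rank == 0,
                 /*num_workers=*/static_cast<size_t>(world_size));

  // add() accumulates an integer stored under the key and returns the new
  // total, so it can serve as a shared counter across processes.
  int64_t arrived = store.add("barrier/epoch0", 1);

  // wait() blocks until the key exists; get() fetches its raw bytes, which
  // the daemon keeps as the decimal string of the counter.
  store.wait("barrier/epoch0");
  auto bytes = store.get("barrier/epoch0");
  std::cout << "rank " << rank << ": add returned " << arrived << ", value "
            << std::string(bytes.begin(), bytes.end()) << std::endl;
}
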
+std::unique_ptr TCPServer::create(uint16_t port) { + int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); + auto server = std::make_unique(); + server->_master_daemon = MasterDaemon::start(socket); + return server; +} + +std::unique_ptr TCPClient::connect(const std::string host, + uint16_t port) { + int socket = tcputils::tcp_connect(host, std::to_string(port), AF_INET); + return std::make_unique(socket); +} + +void TCPClient::send_command_for_key(Command type, const std::string& key) { + tcputils::send_value(_socket, type); + if (key.empty()) { + return; + } + tcputils::send_string(_socket, key); +} + +template +void TCPClient::send_value(const T& value) { + tcputils::send_bytes(_socket, &value, 1); +} + +template +T TCPClient::receive_value() { + T res; + tcputils::receive_bytes(_socket, &res, 1); + return res; +} + +template +void TCPClient::send_vector(const std::vector& value) { + tcputils::send_vector(_socket, value); +} + +template +std::vector TCPClient::receive_vector() { + return tcputils::receive_vector(_socket); +} + +} // namespace detail + +TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, + size_t num_workers, std::chrono::seconds timeout) + : Store(timeout), _is_master(is_master), _num_workers(num_workers) { + if (_is_master) { + _server = detail::TCPServer::create(port); + } + + _client = detail::TCPClient::connect(host, port); + waitWorkers(); +} + +void TCPStore::waitWorkers() { + if (_num_workers == 0) { + return; + } + add(_init_key, 1); + + if (_server) { + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); + } + VLOG(3) << "TCPStore initialized."; +} + +int64_t TCPStore::add(const std::string& key, int64_t value) { + _client->send_command_for_key(Command::ADD, _key_prefix + key); + _client->send_value(value); + return _client->receive_value(); +} + +std::vector TCPStore::get(const std::string& key) { + wait(key); + _client->send_command_for_key(Command::GET, _key_prefix + key); + VLOG(3) << "TCPStore get."; + return _client->receive_vector(); +} + +void TCPStore::wait(const std::string& key) { + ReplyType reply; + do { + _client->send_command_for_key(Command::WAIT, _key_prefix + key); + + reply = _client->receive_value(); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } while (reply != ReplyType::STOP_WAIT); +} + +TCPStore::~TCPStore() { + _client->send_command_for_key(Command::STOP, ""); + ReplyType ret = _client->receive_value(); + PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, + platform::errors::InvalidArgument( + "The reply for TCPStore destructure must be 0.")); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h new file mode 100644 index 0000000000000000000000000000000000000000..cd706dd6640acf5e0b5b3714175dac7a6cecb25a --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -0,0 +1,114 @@ +// 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_utils.h" + +namespace paddle { +namespace distributed { + +enum class ReplyType { WAITING, STOP_WAIT }; +enum class Command { ADD, GET, WAIT, STOP }; + +namespace detail { + +class MasterDaemon { + public: + static std::unique_ptr start(SocketType listen_socket); + MasterDaemon() = delete; + explicit MasterDaemon(SocketType listen_socket); + ~MasterDaemon(); + + private: + void run(); + void _do_add(SocketType socket); + void _do_wait(SocketType socket); + void _do_get(SocketType socket); + void _do_stop(SocketType socket); + SocketType _listen_socket; + std::vector _sockets; + std::unordered_map> _store; + std::thread _background_thread{}; + bool _stop = false; +}; + +class TCPServer { + public: + TCPServer() = default; + static std::unique_ptr create(std::uint16_t port); + + private: + std::unique_ptr _master_daemon; +}; + +class TCPClient { + public: + explicit TCPClient(SocketType socket) : _socket{socket} {} + static std::unique_ptr connect(const std::string host, + uint16_t port); + ~TCPClient() { tcputils::close_socket(_socket); } + void send_command_for_key(Command type, const std::string& key); + + template + void send_value(const T& value); + + template + void send_vector(const std::vector& value); + template + std::vector receive_vector(); + + template + T receive_value(); + + private: + SocketType _socket; +}; + +} // namespace detail + +class TCPStore : public Store { + public: + static constexpr std::uint16_t kDefaultPort = 6170; + explicit TCPStore(std::string host, uint16_t port = kDefaultPort, + bool is_master = false, size_t num_workers = 1, + std::chrono::seconds timeout = tcputils::kDefaultTimeout); + + ~TCPStore(); + + int64_t add(const std::string& key, int64_t value) override; + std::vector get(const std::string& key) override; + void wait(const std::string& key) override; + + private: + void waitWorkers(); + std::unique_ptr _server; + std::unique_ptr _client; + + const std::string _init_key = "init/"; + const std::string _key_prefix = "/"; + std::chrono::seconds _timeout; + bool _is_master; + int _num_workers; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0561d0b9a9c5b01c32620e72d21ed562e42637e --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/store/tcp_utils.h" +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { +namespace tcputils { + +std::error_code socket_error() { +#ifdef _WIN32 + return std::error_code{::WSAGetLastError(), std::generic_category()}; +#else + return std::error_code{errno, std::generic_category()}; +#endif +} + +void close_socket(SocketType socket) { +#ifdef _WIN32 + ::closesocket(socket); +#else + ::close(socket); +#endif +} + +::addrinfo* get_addr_info(const std::string host, const std::string port, + int ai_flags, int family) { + ::addrinfo hints{}, *res; + hints.ai_flags = ai_flags; + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + + const char* node = host.empty() ? nullptr : host.c_str(); + + int n; + n = ::getaddrinfo(node, port.c_str(), &hints, &res); + const char* gai_err = ::gai_strerror(n); + const char* proto = + (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); + PADDLE_ENFORCE_EQ( + n, 0, platform::errors::InvalidArgument( + "%s network %s:%s cannot be obtained. Details: %s.", proto, + host, port, gai_err)); + + return res; +} + +void free_addr_info(::addrinfo* hint) { + PADDLE_ENFORCE_NOT_NULL( + hint, platform::errors::InvalidArgument( + "The parameter for free_addr_info cannot be null.")); + ::freeaddrinfo(hint); +} + +SocketType tcp_connect(const std::string host, const std::string port, + int family, std::chrono::seconds timeout) { + int ai_flags = AI_NUMERICSERV | AI_V4MAPPED | AI_ALL; + ::addrinfo* res = get_addr_info(host, port, ai_flags, family); + + SocketType sockfd = -1; + bool retry = true; + auto deadline = std::chrono::steady_clock::now() + timeout; + do { + for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) { + sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument( + "Create socket to connect %s:%s failed. " + "Details: %s. ", + host, port, socket_error().message())); + + if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) { + retry = false; + break; + } + VLOG(0) << "Retry to connect to " << host << ":" << port + << " while the server is not yet listening."; + close_socket(sockfd); + sockfd = -1; + std::this_thread::sleep_for(kDelay); + if (timeout != kNoTimeout && + std::chrono::steady_clock::now() >= deadline) { + retry = false; + break; + } + } + + if (timeout != kNoTimeout && std::chrono::steady_clock::now() >= deadline) { + retry = false; + } + } while (retry); + + free_addr_info(res); + + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Network %s:%s cannot be connected.", host, port)); + VLOG(0) << "Successfully connected to " << host << ":" << port; + + return sockfd; +} + +SocketType tcp_listen(const std::string host, const std::string port, + int family) { + int ai_flags = AI_PASSIVE | AI_NUMERICSERV; + ::addrinfo* res = get_addr_info(host, port, ai_flags, family); + ::addrinfo* cur = res; + SocketType sockfd{}; + + std::string node = host.empty() ? 
"IP_ANY" : host; + while (cur) { + sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (sockfd < 0) { + VLOG(0) << "Cannot create socket on " << node << ":" << port + << ". Details: " << socket_error().message(); + cur = cur->ai_next; + continue; + } + + int on = 1; +#ifdef _WIN32 + int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, + reinterpret_cast(&on), sizeof(on)); +#else + int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); +#endif + if (ret < 0) { + VLOG(0) << "Set the address reuse option failed on the server."; + } + if (::bind(sockfd, res->ai_addr, res->ai_addrlen) == 0) { + break; + } + close_socket(sockfd); + sockfd = -1; + cur = cur->ai_next; + } + + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Bind network on %s:%s failedd.", node, port)); + + ::listen(sockfd, LISTENQ); + + VLOG(0) << "The server starts to listen on " << node << ":" << port; + return sockfd; +} + +SocketType tcp_accept(SocketType socket) { + ::sockaddr_storage addr_s{}; + ::socklen_t addr_len = sizeof(addr_s); + SocketType new_socket = + ::accept(socket, reinterpret_cast<::sockaddr*>(&addr_s), &addr_len); + PADDLE_ENFORCE_GT( + new_socket, 0, + platform::errors::InvalidArgument( + "The server failed to accept a new connection. Details: %s.", + socket_error().message())); +#ifndef _WIN32 + ::fcntl(new_socket, F_SETFD, FD_CLOEXEC); +#endif + auto value = 1; +#ifdef _WIN32 + ::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, + reinterpret_cast(&value), sizeof(value)); +#else + ::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, &value, sizeof(value)); +#endif + return new_socket; +} + +void send_string(SocketType socket, const std::string& s) { + std::string::size_type size = s.size(); + send_bytes(socket, &size, 1); + send_bytes(socket, s.data(), size); +} + +std::string receive_string(SocketType socket) { + std::string::size_type size; + receive_bytes(socket, &size, 1); + std::vector v(size); + receive_bytes(socket, v.data(), size); + return std::string(v.data(), v.size()); +} + +} // namespace tcputils +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_utils.h b/paddle/fluid/distributed/store/tcp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..60cb3de124da3593f3d07ffadcf3b12c2deedf29 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_utils.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef _WIN32 +#include +#include +#pragma comment(lib, "Ws2_32.lib") +#else +#include +#include +#include +#include +#include +#include +#endif +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +// Utility functions for TCP socket. 
+namespace paddle { +namespace distributed { + +#ifdef _WIN32 +using SocketType = SOCKET; +#else +using SocketType = int; +#endif + +namespace tcputils { + +constexpr int LISTENQ = 2048; +constexpr std::chrono::seconds kDelay = std::chrono::seconds(3); +constexpr std::chrono::seconds kNoTimeout = std::chrono::seconds::zero(); +constexpr std::chrono::seconds kDefaultTimeout = std::chrono::seconds(360); + +std::error_code socket_error(); +void close_socket(SocketType socket); +::addrinfo* get_addr_info(const std::string host, const std::string port, + int ai_flags, int family); +void free_addr_info(::addrinfo*); +SocketType tcp_connect(const std::string host, const std::string port, + int family, std::chrono::seconds timeout = kNoTimeout); +SocketType tcp_listen(const std::string host, const std::string port, + int family); +SocketType tcp_accept(SocketType socket); + +void send_string(SocketType socket, const std::string& s); +std::string receive_string(SocketType socket); + +template +void send_bytes(SocketType socket, const T* buffer, size_t len) { + size_t to_send = len * sizeof(T); + if (to_send == 0) { + return; + } + + auto ptr = reinterpret_cast(buffer); + + while (to_send > 0) { + auto byte_sent = ::send(socket, ptr, to_send, 0); + PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument( + "TCP send error. Details: %s.", + socket_error().message())); + to_send -= byte_sent; + ptr += byte_sent; + } +} + +template +void receive_bytes(SocketType socket, T* buffer, size_t len) { + size_t to_recv = len * sizeof(T); + if (to_recv == 0) { + return; + } + auto ptr = reinterpret_cast(buffer); + + while (to_recv > 0) { + auto byte_received = ::recv(socket, ptr, to_recv, 0); + PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument( + "TCP receive error. 
Details: %s.", + socket_error().message())); + + to_recv -= byte_received; + ptr += byte_received; + } +} + +template +void send_vector(SocketType socket, const std::vector& v) { + size_t size = v.size(); + send_bytes(socket, &size, 1); + send_bytes(socket, v.data(), size); +} + +template +std::vector receive_vector(SocketType socket) { + size_t size; + receive_bytes(socket, &size, 1); + std::vector res(size); + receive_bytes(socket, res.data(), size); + return res; +} + +template +void send_value(SocketType socket, const T& v) { + send_bytes(socket, &v, 1); +} + +template +T receive_value(SocketType socket) { + T v; + receive_bytes(socket, &v, 1); + return v; +} + +} // namespace tcputils +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 711c46e995286e7369a42738a14eae86605a3e79..5e16ab2b391d0223a8b6fd9bae78cced9d4e2f11 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api) -cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) +cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt index 632e289ba230871fd5630d674767b32d9f7b8b3f..43ca707f4f6fbe76c234318d19791d512eb3152b 100644 --- a/paddle/fluid/eager/accumulation/CMakeLists.txt +++ b/paddle/fluid/eager/accumulation/CMakeLists.txt @@ -1 +1 @@ -cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info) +cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator phi phi_api grad_node_info) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 8b6752dfec743d774c8656d36421f31e7dab9799..3a2ec403c0a59aaa23decc72fb9581b5a7f78343 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -25,6 +25,8 @@ #include "glog/logging.h" +namespace egr { + static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, const paddle::experimental::Tensor& t) { if (!tensor->defined() || !tensor->initialized()) { @@ -36,17 +38,10 @@ static void 
CopyOrAddTensor(paddle::experimental::Tensor* tensor, } } -namespace egr { - -void GradNodeAccumulation::RetainGrad( - const std::function& hook) { - retain_grad_hook_ = hook; -} - std::vector> GradNodeAccumulation:: operator()( const std::vector>& grads) { + VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( "GradNodeAccumulation should take exactly 1 grad tensor" @@ -58,17 +53,18 @@ operator()( "However received: %d in slot %d .", grads[0].size(), 0)); // Apply Gradient Hooks + paddle::experimental::Tensor grad_out; if (GradientHooksRegistered()) { std::vector> hooked_grads = ApplyGradientHooks(grads); - // TODO(jiabin): It's little weird - CopyOrAddTensor(&accumulated_grad, hooked_grads[0][0]); + grad_out = hooked_grads[0][0]; } else { - CopyOrAddTensor(&accumulated_grad, grads[0][0]); + grad_out = grads[0][0]; } - if (retain_grad_hook_ != nullptr) { - retain_grad_hook_(accumulated_grad); + if (!weak_grad_.expired()) { + auto grad = weak_grad_.lock(); + CopyOrAddTensor(grad.get(), grad_out); } // Apply Reduce Hooks @@ -76,17 +72,17 @@ operator()( ApplyReduceHooks(); } - return {{accumulated_grad}}; + return {{grad_out}}; } void GradNodeAccumulation::RegisterReduceHook( - const std::function& hook) { - reduce_hooks_.emplace_back(hook); + std::shared_ptr&& hook) { + reduce_hooks_.emplace_back(std::move(hook)); } void GradNodeAccumulation::ApplyReduceHooks() { for (auto& hook : reduce_hooks_) { - hook(); + (*hook)(); } } } // namespace egr diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index be2ccc263e806d8874b0e18f93376bd62745940c..734cabdc3dc914349e2ad30b657bfb6542a7472a 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -14,14 +14,19 @@ #pragma once +#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" namespace egr { class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node - GradNodeAccumulation() : GradNodeBase(1, 1) { SetDefaultGradInOutMeta(); } + explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + weak_grad_ = meta->WeakGrad(); + SetDefaultGradInOutMeta(); + } ~GradNodeAccumulation() override = default; @@ -30,15 +35,12 @@ class GradNodeAccumulation : public GradNodeBase { const std::vector>& grads) override; - void RetainGrad(const std::function& hook); - - paddle::experimental::Tensor* Grad() { return &accumulated_grad; } + std::string name() { return "GradNodeAccumulation"; } /** * Register ReduceHook * **/ - void RegisterReduceHook(const std::function& hook); + void RegisterReduceHook(std::shared_ptr&& hook); /** * Apply ReduceHook here @@ -47,13 +49,13 @@ class GradNodeAccumulation : public GradNodeBase { void ApplyReduceHooks(); private: - paddle::experimental::Tensor accumulated_grad; + std::weak_ptr weak_grad_; std::function retain_grad_hook_; - std::vector> reduce_hooks_; + std::vector> reduce_hooks_; }; } // namespace egr diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index e3fafb265ad9887a5683542d79ae07f30edee910..77d8ec57efcaa6c4e83a69f4b2a97b128b174389 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ 
b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info) +cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) if(NOT ON_INFER) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 3dbfba0d9150f64afd1002fcf7f3e9365bf786d1..5a2595b9103e4d49845fa8938ee3577b6b3f3f06 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -33,36 +33,36 @@ static void ScaleDeviceDispatch(const phi::DenseTensor& dense_tensor, phi::DenseTensor* dense_out) { switch (dense_tensor.dtype()) { case phi::DataType::FLOAT64: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::FLOAT32: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::INT64: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::INT32: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 8ede139ddc0446ddab4404ae2f749a3c84748d73..60b35340eabd1fa03f59cc0b7ea278351be96df1 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node) +cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) if(NOT ON_INFER) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt index 3a4f0ba320358ed1dbd0a493f7263aeae5633f87..c34df3972c23e14b8f15517d86091ccc2ae6d0fc 100644 --- a/paddle/fluid/eager/api/utils/CMakeLists.txt +++ b/paddle/fluid/eager/api/utils/CMakeLists.txt @@ -1,3 +1,3 @@ -cc_library(tensor_utils SRCS tensor_utils.cc DEPS pten pten_api autograd_meta grad_node_info accumulation_node) -cc_library(hook_utils SRCS hook_utils.cc DEPS pten tensor_utils autograd_meta grad_node_info utils accumulation_node) +cc_library(tensor_utils SRCS tensor_utils.cc DEPS phi phi_api autograd_meta grad_node_info accumulation_node) +cc_library(hook_utils SRCS hook_utils.cc DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node) cc_library(global_utils SRCS global_utils.cc DEPS place tracer) diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 
ee6a3afc6ffd39e264b039adaf8bae716c9e483e..c7927716300528fdfa571de720ce12e7246b5f1d 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -22,19 +22,19 @@ namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook) { + std::shared_ptr&& hook) { // Find grad_node and out_rank from AutogradMeta std::shared_ptr grad_node = EagerUtils::grad_node(tensor); auto rank_info = EagerUtils::unsafe_autograd_meta(tensor)->OutRankInfo(); - grad_node->RegisterGradientHook(rank_info.first, rank_info.second, hook); + return grad_node->RegisterGradientHook(rank_info.first, rank_info.second, + std::move(hook)); } void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook) { + std::shared_ptr&& hook) { if (IsLeafTensor(tensor)) { VLOG(6) << "Register ReduceHook for leaf tensor"; std::shared_ptr grad_node = EagerUtils::grad_node(tensor); @@ -45,59 +45,56 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, "with type: GradNodeAccumulation")); auto accumulation_grad_node = std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RegisterReduceHook(hook); + accumulation_grad_node->RegisterReduceHook(std::move(hook)); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Only can register reduce hook for leaf Tensor.")); } } -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - // TODO(jiabin): Support More Tensor type here +static void RetainGradForRegularNode( + const paddle::experimental::Tensor& tensor) { AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } + std::weak_ptr weak_grad_tensor = meta->WeakGrad(); // Define Hook - std::function - hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor.get(); - } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Detected uninitialized variable, causing segmentation " - "fault " - "inside the hook." - "Tensor has to be initialized while we need to set it." 
- "please check tensor initialization status.")); - } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + }; - if (IsLeafTensor(tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = EagerUtils::grad_node(tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RetainGrad(hook); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); +} +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details + return; } else { - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RetainGradForRegularNode(tensor); } } diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index 4c4ecc9fb801de67778cb7209a721dee3572bdf0..b36ef81125a8ca5ef1f2720b73021ae82395a9aa 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -16,17 +16,17 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook); + std::shared_ptr&& hook); void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook); + std::shared_ptr&& hook); void RetainGradForTensor(const paddle::experimental::Tensor& tensor); } // namespace egr_utils_api diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index c06edef7017be133c1cbb1e92e71390b9ab38e74..77c39d1b0a37c3946e4c170484118a5fb6f79170 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -22,7 +22,7 @@ #include "paddle/phi/api/all.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" namespace egr { @@ -43,11 +43,11 @@ paddle::experimental::Tensor CreateTensorWithValue( bool is_leaf) { paddle::experimental::Tensor out = paddle::experimental::full( phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, - phi::TransToPtenBackend(place)); + phi::TransToPhiBackend(place)); auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { - auto accumulation_node = 
std::make_shared(); + auto accumulation_node = std::make_shared(meta); meta->SetGradNode(accumulation_node); meta->SetStopGradient(false); } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 63f25f5528100cccdacf7c1c1ca67095b448160c..a8e0ed7a41a043e12332ad347f673a6c27e5f1ec 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" #define NUM_CREATED_DUP_INPUTS 4 @@ -544,7 +544,7 @@ static bool CheckOpProto(proto::OpProto* op_proto) { // since only OperatorWithKernel can run in dygraph mode. auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { return false; } @@ -554,6 +554,21 @@ static bool CheckOpProto(proto::OpProto* op_proto) { return true; } +static bool BeSameAsInput(const std::string& output_name, + const std::set& input_names) { + if (output_name.size() < 4) { + return false; + } + + if (output_name.substr(output_name.size() - 3, 3) == "Out") { + if (input_names.count(output_name.substr(0, output_name.size() - 3))) { + return true; + } + } + + return false; +} + /* --------------------------------------- */ /* --------- Preprocess Ins/Outs --------- */ /* --------------------------------------- */ @@ -1016,33 +1031,20 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // Skip Intermediate Tensor + if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - if (op_passing_outs_map[op_type].count(output_name)) { - const std::string output_var_args_name = output_name + "Var"; - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(%s, %s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - - if (op_passing_outs_map[op_type].count(output_name)) { - const std::string output_var_args_name = output_name + "Var"; - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(%s, %s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } } VLOG(6) << "Generated outputs autograd_meta"; @@ -1145,6 +1147,8 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_autograd_name = "p_autograd_" + output_name; size_t output_position = fwd_outputs_name_pos_map.at(output_name); + // Intermediate Tensor does not require SetHistory, nor RetainGrad + if (output.duplicable()) { pass_stop_gradient_args += ", &" + output_autograd_name; const char* SET_OUT_RANK_TEMPLATE = @@ -1180,11 +1184,13 @@ static std::string 
GenerateGradNodeCreationContent( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + } } VLOG(6) << "Generated SetGradIn/OutMeta"; @@ -1324,19 +1330,21 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += "\n"; // Handle Dispensable Inputs + std::set input_names; for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); + input_names.insert(input_name); if (input.dispensable()) { if (input.duplicable()) { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.size() > 0) " - "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;"; + "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; generated_function_body += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " - "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;"; + "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; generated_function_body += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); } @@ -1372,11 +1380,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type].push_back("tensor"); } - const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - outs_contents_str += paddle::string::Sprintf( - FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + if (BeSameAsInput(output_name, input_names)) { + if (!output.dispensable()) { + std::string input_name = + output_name.substr(0, output_name.size() - 3); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, input_name); + } + } else { + const char* FWD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + } core_ops_args_info[op_type].push_back(output_var_name); } else { @@ -1415,6 +1433,23 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += outs_map_str; generated_function_body += "\n"; + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + if (op_passing_outs_map[op_type].count(output_name)) { + if (BeSameAsInput(output_name, input_names)) { + if (output.dispensable()) { + std::string input_name = + output_name.substr(0, output_name.size() - 3); + const char* FWD_OUTS_CONTENT_TEMPLATE = + " if (ins.count(\"%s\")) outs[\"%s\"] = ins[\"%s\"];\n"; + generated_function_body += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, input_name, output_name, input_name); + } + } + } + } + generated_function_body += "\n"; + VLOG(6) << "Generated Outs Map"; // [Generation] Get Attrs @@ -1448,33 +1483,61 @@ static std::pair GenerateForwardFunctionContents( std::string output_varname = LegalizeVariableName(output_name); if (output.duplicable()) { - const char* FWD_OUT_TENSORS_TEMPLATE = - " std::vector %s = " - 
"egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, - output_varname, output_name); + if (op_passing_outs_map[op_type].count(output_name)) { + if (output.dispensable()) { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " if (outs.count(\"%s\")) " + "egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" + " egr::EagerUtils::Output2Result(%s, &%s);\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, + output_name, output_var_args_name, output_var_args_name, + output_varname); + } else { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" + " egr::EagerUtils::Output2Result(%s, &%s);\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, + output_var_args_name, output_var_args_name, output_varname); + } + } else { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname, + output_name, output_varname); + } return_types[return_position] = "std::vector"; - if (op_passing_outs_map[op_type].count(output_name) && - bwd_info.GenerateForwardOnly()) { - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(outs[\"%s\"], %s);\n"; - out_tensor_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s = " - "egr::EagerUtils::GetOutput(outs[\"%s\"][0]);\n"; - out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, - output_varname, output_name); - - if (op_passing_outs_map[op_type].count(output_name) && - bwd_info.GenerateForwardOnly()) { - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(outs[\"%s\"][0], %s);\n"; - out_tensor_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); + if (op_passing_outs_map[op_type].count(output_name)) { + if (output.dispensable()) { + const char* FWD_OUT_TENSOR_TEMPLATE = + " if (outs.count(\"%s\")) " + "egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" + " paddle::experimental::Tensor& %s = *%s;\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, output_name, + output_var_args_name, output_varname, output_var_args_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" + " paddle::experimental::Tensor& %s = *%s;\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name, + output_varname, output_var_args_name); + } + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); } return_types[return_position] = "paddle::experimental::Tensor"; } @@ -1494,6 +1557,7 @@ static std::pair GenerateForwardFunctionContents( GenerateGradNodeCreationContent(fwd_info, bwd_info); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; + // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; } @@ -1588,12 +1652,25 @@ static std::string 
GenerateSingleOpBase( const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); // [Generation] Get Ins Map + std::unordered_set dispensable_input_name_set; + for (const auto& in : in_vars) { + if (in.dispensable()) dispensable_input_name_set.insert(in.name()); + } + std::unordered_set duplicable_input_name_set; + for (const auto& in : in_vars) { + if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + } std::string ins_contents_str = ""; for (auto iter : grad_ins) { const std::string& grad_input_name = iter.first; if (grad_ins_fwd_slotname_map.count(grad_input_name)) { // Fwd Tensor + const std::string& fwd_name = + grad_ins_fwd_slotname_map.at(grad_input_name); + if (dispensable_input_name_set.count(fwd_name)) { + continue; + } std::string struct_fwd_input_name = grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; const char* GRAD_INS_FWD_CONTENT_TEMPLATE = @@ -1634,14 +1711,41 @@ static std::string GenerateSingleOpBase( paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); generated_grad_function_body += ins_map_str; - VLOG(6) << "Generated Ins Map"; + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; - // [Generation] Get Outs Map - std::unordered_set duplicable_input_name_set; - for (const auto& in : in_vars) { - if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + if (grad_ins_fwd_slotname_map.count(grad_input_name)) { + // Fwd Tensor + const std::string& fwd_name = + grad_ins_fwd_slotname_map.at(grad_input_name); + if (dispensable_input_name_set.count(fwd_name)) { + std::string struct_fwd_input_name = + grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; + if (duplicable_input_name_set.count(fwd_name)) { + const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = + " if(this->%s.size() > 0) %s[\"%s\"] = " + "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" + "RecoverTensorWrapper(&this->%s, nullptr));\n"; + generated_grad_function_body += paddle::string::Sprintf( + DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name, + ins_name, grad_input_name, struct_fwd_input_name); + } else { + const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = + " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, " + "nullptr);\n if(%s.initialized()) %s[\"%s\"] = " + "egr::EagerUtils::TrySyncToVars(%s);\n"; + generated_grad_function_body += paddle::string::Sprintf( + DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, + struct_fwd_input_name, grad_input_name, ins_name, grad_input_name, + grad_input_name); + } + } + } } + VLOG(6) << "Generated Ins Map"; + + // [Generation] Get Outs Map std::string outs_contents_str = ""; for (auto iter : grad_outs) { const std::string& grad_output_name = iter.first; @@ -1936,12 +2040,13 @@ static std::string GenerateGradNodeCCContents( const char* BWD_RETURN_TEMPLATE = " std::vector> hooked_grads = " - "egr::GradNodeBase::ApplyGradientHooks(grads);\n" + "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" " return outputs;\n"; - generated_grad_function_body = paddle::string::Sprintf( - BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); + generated_grad_function_body = + paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), + generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = @@ -1987,6 +2092,7 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " // SetAttrMap\n" "%s\n" + " std::string name() { return \"GradNode%s\"; }\n" 
"\n" " private:\n" " // TensorWrappers\n" @@ -2085,8 +2191,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, - attr_members_str); + set_tensor_wrappers_str, set_attr_map_str, op_type, + tensor_wrapper_members_str, attr_members_str); return grad_node_str; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 786bf21e8c8a13da69b201cf988f291dbee64a73..c6e56e34627a52bc19df7e8d87371811fcec8697 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -127,6 +127,40 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): + # intermediate_outputs : [name0, name1, ...] + # forward_returns_list : [[ret_name, type, orig_pos], ...] + """ + Check whether intermediate_outputs are positioned + at the very end of forward_returns_list + """ + + intermediate_positions = range( + len(forward_returns_list) - len(intermediate_outputs), + len(forward_returns_list)) + for ret_name, _, pos in forward_returns_list: + if ret_name in intermediate_outputs: + assert pos in intermediate_positions + + +def ParseDispensable(string): + # string: "X, Y" + return [v.strip() for v in string.split(",")] + + +def ParseIntermediate(string): + return [v.strip() for v in string.split(",")] + + +def ParseNoNeedBuffer(string): + # string: "x, y" + no_need_buffer_set = set() + for name in string.split(","): + no_need_buffer_set.add(name.strip()) + + return no_need_buffer_set + + def ParseYamlArgs(string): # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y @@ -397,7 +431,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list, def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, - backward_attrs_list): + backward_attrs_list, no_need_buffer_set): # Inputs: # fwd_api_name = "" # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} @@ -410,15 +444,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): + if tname in no_need_buffer_set: + no_need_buffer = "true" + else: + no_need_buffer = "false" + tensor_wrapper_name = GetSavedName(tname) if IsPlainTensorType(ttype): SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved); + {} = egr::TensorWrapper({}, full_reserved, {}); }} """ set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tensor_wrapper_name, tname) + tname, tname, tensor_wrapper_name, tname, no_need_buffer) PLAIN_TENSOR_MEMBER_TEMPLATE = """ egr::TensorWrapper {}; @@ -430,12 +469,12 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved) ); + {}.emplace_back( egr::TensorWrapper(eager_tensor, 
full_reserved, {}) ); }}; }} """ set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tname, tensor_wrapper_name) + tname, tname, tname, tensor_wrapper_name, no_need_buffer) VECTOR_TENSOR_MEMBER_TEMPLATE = """ std::vector {}; @@ -562,11 +601,11 @@ std::vector> {}::operator()(const std: return node_definition_str -def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, - forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list): +def GenerateNodeCreationCodes( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -640,10 +679,17 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, # SetTensorWrappers set_tensor_wrappers_list = [] for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + is_optional = (name in optional_inputs) if is_fwd_input: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -732,7 +778,8 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list): + backward_grad_output_map, backward_attrs_list, + optional_inputs, intermediate_outputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -741,6 +788,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...} # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...} # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] + # optional_inputs = ["name0", ...] 
# Get Function Args num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys( @@ -750,17 +798,18 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_call_list = ["" for i in range(num_inputs)] for name, (ttype, pos) in forward_inputs_position_map.items(): inputs_call_list[pos] = f"{name}" + is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): - inputs_args_definition_list[ - pos] = f"const paddle::experimental::Tensor& {name}" - inputs_args_declaration_list[ - pos] = f"const paddle::experimental::Tensor& {name}" + if is_optional: + arg_str = f"const paddle::optional& {name}" + else: + arg_str = f"const paddle::experimental::Tensor& {name}" else: assert IsVectorTensorType(ttype) - inputs_args_definition_list[ - pos] = f"const std::vector& {name}" - inputs_args_declaration_list[ - pos] = f"const std::vector& {name}" + arg_str = f"const std::vector& {name}" + + inputs_args_definition_list[pos] = arg_str + inputs_args_declaration_list[pos] = arg_str for name, atype, default_val, pos in forward_attrs_list: inputs_call_list[pos] = name @@ -776,13 +825,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_call_args_str = ", ".join(inputs_call_list) # Forward Full Logic - forward_call_str = f"auto api_result = paddle::experimental::{fwd_api_name}({inputs_call_args_str});" + if len(intermediate_outputs) == 0: + function_name = fwd_api_name + else: + function_name = fwd_api_name + "_intermediate" + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs - num_outputs = len(forward_outputs_position_map.keys()) + num_outputs = len(forward_outputs_position_map.keys()) - len( + intermediate_outputs) returns_type_list = ["" for i in range(num_outputs)] returns_list = ["" for i in range(num_outputs)] for name, (rtype, pos) in forward_outputs_position_map.items(): + if name in intermediate_outputs: + continue if num_outputs == 1: returns_list[0] = f"api_result" else: @@ -808,7 +864,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list) + backward_grad_output_map, backward_attrs_list, optional_inputs) FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ @@ -997,6 +1053,10 @@ if __name__ == "__main__": assert 'output' in fwd_api.keys() assert 'backward' in fwd_api.keys() + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) + fwd_api_name = fwd_api['api'] fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] @@ -1008,6 +1068,12 @@ if __name__ == "__main__": assert 'args' in bwd_api.keys() assert 'output' in bwd_api.keys() assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + bwd_forward_str = bwd_api['forward'] bwd_args_str = bwd_api['args'] bwd_returns_str = bwd_api['output'] @@ -1019,6 +1085,12 @@ if __name__ == "__main__": print("Prased Forward Attrs List: ", forward_attrs_list) print("Parsed Forward Returns List: ", forward_returns_list) + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) + + IntermediateValidationCheck(intermediate_outputs, 
forward_returns_list) + # Collect Original Forward Inputs/Outputs and then perform validation checks orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) @@ -1062,7 +1134,8 @@ if __name__ == "__main__": # Node Declaration Generation node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list) + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) print("Generated Node Declaration: ", node_declaration_str) node_definition_str += GenerateNodeDefinition( @@ -1076,7 +1149,8 @@ if __name__ == "__main__": fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list) + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) print("Generated Forward Definition: ", forward_definition_str) print("Generated Forward Declaration: ", forward_declaration_str) forward_definition_str += definition_declaration_pair[0] diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index a95d6dce29aad275bd4df220b68f1f5b2302c189..9329dc5ffc9dd0faa36b8ff6a8373387bc2678c7 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import ReadFwdFile, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -70,10 +70,12 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map): + forward_attrs_list, forward_outputs_position_map, + optional_inputs): # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] + # optional_inputs = [name0, ...] 
# Get EagerTensor from args # Get dygraph function call args @@ -82,7 +84,14 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, dygraph_function_call_list = ["" for i in range(num_args)] get_eager_tensor_str = "" for name, (ttype, pos) in forward_inputs_position_map.items(): - get_eager_tensor_str += f" auto& {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + else: + if is_optional: + get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + else: + get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" dygraph_function_call_list[pos] = f"{name}" parse_attributes_str = "" @@ -134,7 +143,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}},\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -188,7 +197,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { """ core_ops_infos_registry = """ - {\"get_final_state_core_ops_args_info\", + ,{\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -267,6 +276,11 @@ if __name__ == "__main__": fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + # Collect Original Forward Inputs/Outputs and then perform validation checks forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) @@ -283,7 +297,7 @@ if __name__ == "__main__": python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map) + forward_outputs_position_map, optional_inputs) python_c_function_list.append(python_c_function_str) python_c_function_reg_list.append(python_c_function_reg_str) print("Generated Python-C Function: ", python_c_function_str) diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index f4b2b8e08d4fa465c1c3d868659d69f55c4223ea..9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -97,6 +97,7 @@ class AutogradMeta : public AbstractAutogradMeta { "Should Not set NULL as GradNode pointer, since " "our default Edge and autogradMeta has nullptr for " "grad node. 
Set Nullptr will lead error.")); + grad_node_ = grad_node; } @@ -127,6 +128,12 @@ class AutogradMeta : public AbstractAutogradMeta { stop_gradient_ = static_cast(stop_gradient); } + void WeakSetStopGradient(bool stop_gradient) { + if (stop_gradient_ == -1) { + stop_gradient_ = static_cast(stop_gradient); + } + } + bool Persistable() const { return persistable_; } void SetPersistable(bool persistable) { persistable_ = persistable; } diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 42a3a13e5f70aef673e17521bf2fc57ed3869550..41e57ef1a15b0181c23b8e3f4b1bba12218a24f7 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -14,10 +14,10 @@ #pragma once // framework deps -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -// pten deps +// Phi deps #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/api_declare.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -31,7 +31,7 @@ * provide variable in * paddle::framework::ExecutionContext to support it. We should remove this as * soon as we finish our latest - * Pten Lib, and use paddle::experimental::Tensor instead. + * Phi Lib, and use paddle::experimental::Tensor instead. * * Note: Keep this class as clean as possible. * This class should only support method declared in diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 598b368c6426a0fde9b286fc92d8e5a01660ef0a..35416281f188892ec11413a19abad9b3e5c29e76 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -53,7 +53,7 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -69,13 +69,16 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "adj_edges is designed to has the same size of grad " "inputs's slot num.")); if (meta && !meta->StopGradient()) { - VLOG(6) << "Add Edges for slot: " << slot_id; auto node = meta->GetMutableGradNode(); if (node) { + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -207,22 +210,22 @@ const std::vector>& GradNodeBase::GetEdges() const { return adj_edges_; } -void GradNodeBase::RegisterGradientHook( - size_t slot_id, size_t rank, - const std::function& hook) { - gradient_hooks_.emplace_back(std::make_tuple(slot_id, rank, hook)); +int64_t GradNodeBase::RegisterGradientHook( + size_t slot_id, size_t rank, std::shared_ptr&& hook) { + gradient_hooks_.emplace(next_hook_id_, + std::make_tuple(slot_id, rank, std::move(hook))); + return next_hook_id_++; } std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { std::vector> outs(tensors.size()); - for (auto& tuple : 
gradient_hooks_) { - size_t slot_id = std::get<0>(tuple); - size_t rank = std::get<1>(tuple); - std::function& hook = std::get<2>(tuple); + for (auto& hook_pair : gradient_hooks_) { + size_t slot_id = std::get<0>(hook_pair.second); + size_t rank = std::get<1>(hook_pair.second); + + auto hook = std::get<2>(hook_pair.second); PADDLE_ENFORCE(slot_id < tensors.size(), paddle::platform::errors::Fatal( @@ -239,12 +242,11 @@ GradNodeBase::ApplyGradientHooks( slot_out.resize(tensors[slot_id].size()); paddle::experimental::Tensor& out = slot_out[rank]; if (!out.defined() || !out.initialized()) { - VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name(); - out = hook(tensors[slot_id][rank]); + out = (*hook)(tensors[slot_id][rank]); } else { // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook - out = hook(out); + out = (*hook)(out); } } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 8603d84fe8df597a69f041e2fec41d05dfe16448..eeac1cca4acf33190ce30613e4a86e99a95b651b 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { @@ -135,18 +136,30 @@ class GradNodeBase { /** * Register GradientHook * **/ - void RegisterGradientHook(size_t slot_id, size_t rank, - const std::function& hook); + int64_t RegisterGradientHook(size_t slot_id, size_t rank, + std::shared_ptr&& hook); + + /** + * Remove GradientHook + * **/ + bool RemoveGradientHook(const int64_t& hook_id) { + auto remove_cnt = gradient_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } /** * Apply GradientHook * **/ - inline bool GradientHooksRegistered() { return gradient_hooks_.size() != 0; } + inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); } std::vector> ApplyGradientHooks( const std::vector>& tensors); + virtual std::string name() { return "GradNodeBase"; } + private: // TODO(jiabin): Use SmallVector instead after merge PR from develop @@ -164,12 +177,14 @@ class GradNodeBase { // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward - // Each entry consists one pair of - std::vector>> + // Each entry consists one pair of + // >> + std::map>> gradient_hooks_; + + int64_t next_hook_id_{0}; }; class Edge { diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h new file mode 100644 index 0000000000000000000000000000000000000000..097150cf5ed59d0fdb9dda49e03eb75e6f1b4207 --- /dev/null +++ b/paddle/fluid/eager/hooks.h @@ -0,0 +1,63 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/phi/api/include/tensor.h" +namespace egr { + +class TensorHook { + public: + virtual ~TensorHook() = default; + virtual paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) = 0; +}; + +class TensorVoidHook { + public: + virtual ~TensorVoidHook() = default; + virtual void operator()() = 0; +}; + +class CppTensorHook : public TensorHook { + public: + explicit CppTensorHook(std::function&& fn) + : fn_(std::move(fn)) {} + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + return fn_(var); + } + + private: + std::function + fn_; +}; + +class CppTensorVoidHook : public TensorVoidHook { + public: + explicit CppTensorVoidHook(std::function&& fn) : fn_(std::move(fn)) {} + + void operator()() override { return fn_(); } + + private: + std::function fn_; +}; +} // namespace egr diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 6cc17b0a9c5faf9d54a78d21cc4970880140f8fd..31aaa93c41643f565836c536d7001c01d2a0826d 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,8 @@ class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false) { + bool full_reserved = false, + bool no_need_buffer = false) { /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -48,16 +49,30 @@ class TensorWrapper { } // shallow copy tensor_impl here - intermidiate_tensor_.set_impl(tensor.impl()); + if (no_need_buffer) { + if (phi::DenseTensor::classof(tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto tw_dense_tensor = std::make_shared(); + tw_dense_tensor->set_meta(dense_tensor->meta()); + intermidiate_tensor_.set_impl(tw_dense_tensor); + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unrecognized tensor type for no_need_buffer feature")); + } + } else { + intermidiate_tensor_.set_impl(tensor.impl()); + } + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - PADDLE_ENFORCE_NOT_NULL( - EagerUtils::unsafe_autograd_meta(tensor), - paddle::platform::errors::Fatal( - "Full reserved Tensor should not have null autograd meta, since " - "tensor_wrapper is used to build backward info. There is no way " - "for us to build it with null autograd_meta.")); - // copy output_rank - out_rank_info_ = EagerUtils::OutRankInfo(tensor); + + // If an output is marked "intermedaite", we won't create + // autograd_meta for it. 
+ // In that case, simply skip OutRankInfo Copy + if (EagerUtils::nullable_autograd_meta(tensor)) { + out_rank_info_ = EagerUtils::OutRankInfo(tensor); + } } paddle::experimental::Tensor recover( diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 682e55e7d92945e13a219c956f373e800c174325..28682ab0fe094df6d27eb27e9118e6576685c95a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -17,11 +17,14 @@ #include "gtest/gtest.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" // TODO(jiabin): remove nolint here!!! @@ -37,7 +40,7 @@ TEST(AccumulationNode, Tensor) { .get(), meta); dt0->mutable_data( - paddle::platform::CPUPlace())[0] = 10.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(10.0f); paddle::experimental::Tensor et0 = paddle::experimental::Tensor(dt0); std::shared_ptr dt1 = std::make_shared( @@ -47,84 +50,102 @@ TEST(AccumulationNode, Tensor) { meta); dt1->mutable_data( - paddle::platform::CPUPlace())[0] = 20.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(20.0f); paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + + // Initialize Grad Tensor std::shared_ptr grad_dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) .get(), meta); - paddle::experimental::Tensor grad_et = paddle::experimental::Tensor(grad_dt); + grad_dt->mutable_data( + paddle::platform::CPUPlace())[0] = paddle::platform::float16(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); // AccumulationNode - GradNodeAccumulation node = GradNodeAccumulation(); - - // Hook, RetainGrad - std::function - hook = [&grad_et](const paddle::experimental::Tensor& t) { - grad_et.set_impl(t.impl()); - return grad_et; - }; - node.RetainGrad(hook); + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); // operator() - paddle::experimental::Tensor ret_et0 = node({{et0}})[0][0]; + paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - paddle::experimental::Tensor ret_et1 = node({{et1}})[0][0]; + paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0]; + auto* ret_et1_ptr = std::dynamic_pointer_cast(ret_et1.impl()) ->data(); - CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(30.0f)); + CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(20.0f)); - // Retain Grad - auto* ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - CHECK_EQ(ret_grad_et_ptr[0], paddle::platform::float16(30.0f)); + // Check Retain Grad + 
CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = std::dynamic_pointer_cast(grad->impl()) + ->data(); + CHECK_EQ(grad_ptr[0], paddle::platform::float16(30.0f)); // Reduce Hook case 1: Call RegisterReduceHook and run operator() VLOG(6) << "Test Reduce Hook"; + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + auto reduce_hook_1 = [&](void) -> void { - auto* grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - grad_et_ptr[0] = 36.0; + auto* input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) + ->mutable_data( + paddle::platform::CPUPlace()); + input_et_ptr[0] = 36.0; VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook( + std::make_shared(reduce_hook_1)); // operator() - paddle::experimental::Tensor _ret = node({{et0}})[0][0]; + paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; // Check operator() result, should be 36.0 auto* _ret_ptr = std::dynamic_pointer_cast(_ret.impl()) ->data(); - CHECK_EQ(_ret_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_ptr[0], paddle::platform::float16(10.0f)); // Check Retain Grad, should be 36.0 - auto* _ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) + auto* _ret_input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) ->data(); - CHECK_EQ(_ret_grad_et_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_input_et_ptr[0], paddle::platform::float16(36.0f)); // Reduce Hook case 2: Call RegisterReduceHook and ApplyReduceHooks directly VLOG(6) << "Test Reduce Hook"; auto reduce_hook_2 = [&](void) -> void { auto* ret_et0_ptr = std::dynamic_pointer_cast(et0.impl()) - ->data(); + ->mutable_data( + paddle::platform::CPUPlace()); ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_2); - node.ApplyReduceHooks(); + node->RegisterReduceHook( + std::make_shared(reduce_hook_2)); + node->ApplyReduceHooks(); // Check ApplyReduceHooks result CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index aee6ee7488671930664e2accdea89a7d872c9583..e3db309c4016a512c5379fb352beb4af690a271e 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -32,7 +33,7 @@ TEST(GradNodeInfo, GradSlotMeta) { CHECK_EQ(grad_slot.Size(), 2); } -TEST(GradNodeInfo, GradNodeBase) { +void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Construct Grad Node"; auto grad_test_node0 = std::make_shared( /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); @@ -112,13 +113,25 @@ TEST(GradNodeInfo, GradNodeBase) { VLOG(6) << "Running Gradient Hook"; return res; }; - grad_test_node0->RegisterGradientHook(0, 0, gradient_hook); - // 5 + 6 + int64_t hook_id = grad_test_node0->RegisterGradientHook( + 0, 0, std::make_shared(gradient_hook)); + + if (is_remove_gradient_hook) { 
+ // Remove GradientHook + grad_test_node0->RemoveGradientHook(hook_id); + } + + // Check results auto grad_hook_res = grad_test_node0->ApplyGradientHooks(grads); CHECK_EQ( std::dynamic_pointer_cast(grad_hook_res[0][0].impl()) ->data()[0], - 11.0); + is_remove_gradient_hook ? 5.0 : 11.0); +} + +TEST(GradNodeInfo, GradNodeBase) { + TestGradNodeBase(true); + TestGradNodeBase(false); } TEST(GradNodeInfo, Edge) { diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 771b324a69b5a99e9f4857552cc8c5d8b25b5c90..a4bc56bd606f3fbb0f9152d58acb5c8edeecf905 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -59,22 +59,18 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } std::vector outs = {target_tensor}; @@ -123,22 +119,17 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); - // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } @@ -201,22 +192,17 @@ TEST(Backward, LinearNodes) { std::vector res0 = {&meta0}; node0_ptr->AddEdges(&res0, 0); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node1 -> AccumulationNode via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(acc_node_ptr); - std::vector res1 = {&meta1}; + 
auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; node1_ptr->AddEdges(&res1, 0); } @@ -311,22 +297,17 @@ TEST(Backward, WithAccumulation) { std::vector res1 = {&meta1}; node1_ptr->AddEdges(&res1, 0); + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); - AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta2->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node2 -> AccumulationNode via Edge - auto meta2 = egr::AutogradMeta(); - meta2.SetStopGradient(false); - meta2.SetSingleOutRankWithSlot(0, 0); - meta2.SetGradNode(acc_node_ptr); - std::vector res2 = {&meta2}; + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; node2_ptr->AddEdges(&res2, 0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index a44ca6fcffbff537dbcb46017f7c4953bd3d984c..524872b2e55638d25697388aa50724f49f6e3818 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -46,34 +46,26 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { paddle::experimental::Tensor& target_tensor = target_tensors[0]; paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); - { - auto scale_node_ptr = std::make_shared(1, 1); - scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); - - scale_node_ptr->SetDefaultGradInOutMeta(); - - auto acc_node_ptr = std::make_shared(); - - AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(scale_node_ptr)); - auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - auto_grad_meta->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - - auto meta = AutogradMeta(); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetStopGradient(false); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); - auto_grad_meta1->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); - auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - } + + auto scale_node_ptr = std::make_shared(1, 1); + scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + + scale_node_ptr->SetDefaultGradInOutMeta(); + + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + + AutogradMeta* meta = EagerUtils::autograd_meta(&leaf_tensor); + auto acc_node_ptr = std::make_shared(meta); + meta->SetStopGradient(false); + meta->SetSingleOutRankWithSlot(0, 0); + meta->SetGradNode(acc_node_ptr); + std::vector res = {meta}; + scale_node_ptr->AddEdges(&res, 0); RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 
990f700056158fabe2314aa2f5bc9946c0e5076c..217055e4e9e4a19e695f42bf57c2331c9b98e2bd 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -159,7 +159,7 @@ TEST(EagerUtils, PassStopGradient) { CHECK(auto_grad0->StopGradient() == false); egr::EagerUtils::PassStopGradient(true, auto_grad0.get(), auto_grad1.get(), auto_grad2.get(), auto_grad3.get()); - CHECK(auto_grad0->StopGradient() == true); + CHECK(auto_grad0->StopGradient() == false); CHECK(auto_grad1->StopGradient() == true); CHECK(auto_grad2->StopGradient() == true); CHECK(auto_grad3->StopGradient() == true); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 752fd7812847c442f83d150eafa331360dfa8693..5a7bafb2fe37051c0ad054c130d77dd6e05319d2 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -27,6 +27,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -221,10 +222,6 @@ TEST(FwdBwdJoint, GradientHook) { phi::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); egr_utils_api::RetainGradForTensor(tensor); - std::function - hook = &hook_function; - // 3. Run Forward // Run Forward Node 0 float scale0 = 2.0; @@ -232,24 +229,27 @@ TEST(FwdBwdJoint, GradientHook) { paddle::experimental::Tensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out0); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out0, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; paddle::experimental::Tensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out1); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out1, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; paddle::experimental::Tensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out2); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out2, std::make_shared(hook_function)); // hook: +5 // 4. 
Run Backward std::vector outs = {out1, out2}; diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index bf2f620dd19bae44fabcbffedc1dcef6a1b52bf9..9cda961741f55e9b4b7fc8dac61fe4a7c96567cf 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -28,6 +28,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -79,16 +80,10 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); - // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -99,38 +94,36 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 - } - - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); } // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + + auto_grad_meta->SetStopGradient(false); + auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } @@ -160,16 +153,11 @@ TEST(RetainGrad, HookAfterRetainGrad) { scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -181,40 +169,30 @@ TEST(RetainGrad, HookAfterRetainGrad) { auto_grad_meta)); egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); - } - - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); + 
egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); } // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + auto_grad_meta->SetGradNode(acc_node_ptr); + auto_grad_meta->SetStopGradient(false); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RetainGradForTensor( - leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); } RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index dbcfe704dbe1c31849b338dc4c1b9ea56e6ad667..15b2a62dca751859882e82d46acaa46f27c2c518 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -24,6 +24,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" namespace egr { @@ -54,7 +55,7 @@ paddle::experimental::Tensor hook_function( return ret; } -TEST(Hook_intermidiate, Sigmoid) { +void test_sigmoid(bool is_remove_gradient_hook) { // Prepare Device Contexts VLOG(6) << "Init Env"; eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -67,11 +68,6 @@ TEST(Hook_intermidiate, Sigmoid) { ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 0.0, true); - VLOG(6) << "Make Hook function"; - std::function - hook = &hook_function; - VLOG(6) << "Make ReduceHook function"; auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(tensor.impl()) @@ -85,10 +81,12 @@ TEST(Hook_intermidiate, Sigmoid) { egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Register GradientHook for Tensor"; - egr_utils_api::RegisterGradientHookForTensor(tensor, hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + tensor, std::make_shared(hook_function)); VLOG(6) << "Register ReduceHook for Tensor"; - egr_utils_api::RegisterReduceHookForTensor(tensor, reduce_hook); + egr_utils_api::RegisterReduceHookForTensor( + tensor, std::make_shared(reduce_hook)); VLOG(6) << "Runing Forward"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); @@ -98,11 +96,17 @@ TEST(Hook_intermidiate, Sigmoid) { std::vector target_tensors = {output_tensor}; + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(tensor); + grad_node_tmp->RemoveGradientHook(hook_id); + } + VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - eager_test::CompareGradTensorWithValue(tensor, 0.25 + 3); + eager_test::CompareGradTensorWithValue( + tensor, is_remove_gradient_hook ? 
0.25 : 0.25 + 3.0); VLOG(6) << "Checking ReduceHook results"; for (int i = 0; i < tensor.numel(); i++) { @@ -113,7 +117,7 @@ TEST(Hook_intermidiate, Sigmoid) { VLOG(6) << "After Tests"; } -TEST(Hook_intermidiate, ElementwiseAdd) { +void test_elementwiseAdd(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -132,11 +136,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - - auto reduce_hook = [&](void) -> void { + auto reduce_hook = [&]() -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); for (int i = 0; i < Y.numel(); i++) { @@ -145,18 +145,26 @@ TEST(Hook_intermidiate, ElementwiseAdd) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); eager_test::CompareTensorWithValue(output_tensor, 5); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); - eager_test::CompareGradTensorWithValue(Y, 4.0); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 1.0 : 1.0 + 3.0); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -166,7 +174,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { } } -TEST(Hook_intermidiate, Matmul_v2) { +void test_matmul(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -185,10 +193,6 @@ TEST(Hook_intermidiate, Matmul_v2) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); @@ -198,19 +202,27 @@ TEST(Hook_intermidiate, Matmul_v2) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); eager_test::CompareTensorWithValue(output_tensor, 96); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); - eager_test::CompareGradTensorWithValue(Y, 3.0 * 4 + 3); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 
3.0 * 4 : 3.0 * 4 + 3); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -219,6 +231,22 @@ TEST(Hook_intermidiate, Matmul_v2) { static_cast(100.0f)); } } + +TEST(Hook_intermidiate, Sigmoid) { + // True or false represents whether to call RemoveGradientHook + test_sigmoid(true); + test_sigmoid(false); +} + +TEST(Hook_intermidiate, ElementwiseAdd) { + test_elementwiseAdd(true); + test_elementwiseAdd(false); +} + +TEST(Hook_intermidiate, Matmul_v2) { + test_matmul(true); + test_matmul(false); +} } // namespace egr USE_OP(sigmoid); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 9c6c7d4d540c6b4a42b5bb9b266d7175c55b15ad..a7e5931f1f9bc66006fb1a37836be1eda371953e 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/tensor_wrapper.h" @@ -21,9 +22,8 @@ #include "paddle/phi/common/layout.h" #include "paddle/phi/core/tensor_meta.h" -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true, @@ -109,6 +109,16 @@ std::shared_ptr EagerUtils::grad_node( } } +paddle::experimental::Tensor* EagerUtils::mutable_grad( + const paddle::experimental::Tensor& target) { + auto* meta = nullable_autograd_meta(target); + if (meta) { + return meta->MutableGrad(); + } else { + return nullptr; + } +} + void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { @@ -220,53 +230,62 @@ paddle::experimental::Tensor EagerUtils::GetOutput( return paddle::experimental::Tensor(out->GetTensorBase(), out->name()); } -void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, - paddle::experimental::Tensor* tensor) { +void EagerUtils::GetOutput(const std::shared_ptr& out, + paddle::experimental::Tensor* out_var) { PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "Tensor is null and cannot be copied. " - "We are tring to OverwriteOutput from its " - "shared_ptr, this error may indicate some outputs " - "are nullptr")); - tensor->set_impl(out->GetTensorBase()); + out_var, paddle::platform::errors::Fatal( + "Tensor is null and cannot be copied. 
" + "We are tring to OverwriteOutput from its " + "shared_ptr, this error may indicate some outputs " + "are nullptr")); + out_var->set_impl(out->GetTensorBase()); } -void EagerUtils::OverwriteOutputs( +void EagerUtils::GetOutputs( const std::vector>& outs, - const std::vector& tensors) { - PADDLE_ENFORCE_EQ( - outs.size(), tensors.size(), - paddle::platform::errors::Fatal( - "We are tring to OverwriteOutputs which passed in and it expected " - "elements num of outs and origin outputs are equal, but we got outs " - "size of: %d, and tensors passed in size is: %d", - outs.size(), tensors.size())); + std::vector* result) { for (size_t i = 0; i < outs.size(); i++) { - OverwriteOutputs(outs[i], tensors[i]); + result->emplace_back(outs[i]->GetTensorBase()); } } -void EagerUtils::OverwriteOutputs(const paddle::experimental::Tensor& out, - paddle::experimental::Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "Tensor is null and cannot be copied. " - "We are tring to OverwriteOutput from its " - "shared_ptr, this error may indicate some outputs " - "are nullptr")); - *tensor = out; -} -void EagerUtils::OverwriteOutputs( - const std::vector& outs, - const std::vector& tensors) { +void EagerUtils::GetOutputs( + const std::vector>& outs, + const std::vector& out_var) { for (size_t i = 0; i < outs.size(); i++) { PADDLE_ENFORCE_NOT_NULL( - tensors[i], paddle::platform::errors::Fatal( + out_var[i], paddle::platform::errors::Fatal( "Tensor is null and cannot be copied. " "We are tring to OverwriteOutput from its " "shared_ptr, this error may indicate some outputs " "are nullptr")); - *tensors[i] = outs[i]; + out_var[i]->set_impl(outs[i]->GetTensorBase()); + } +} + +void EagerUtils::GetOutputs(const std::shared_ptr& out, + std::vector* result) { + result->emplace_back(out->GetTensorBase()); +} + +void EagerUtils::GetOutputs( + const std::shared_ptr& out, + const std::vector& out_var) { + PADDLE_ENFORCE_NOT_NULL( + out_var[0], paddle::platform::errors::Fatal( + "Tensor is null and cannot be copied. 
" + "We are tring to OverwriteOutput from its " + "shared_ptr, this error may indicate some outputs " + "are nullptr")); + out_var[0]->set_impl(out->GetTensorBase()); +} + +void EagerUtils::Output2Result( + const std::vector& out_var, + std::vector* result) { + result->reserve(out_var.size()); + for (size_t i = 0; i < out_var.size(); i++) { + result->emplace_back(*out_var[i]); } } @@ -333,7 +352,8 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } else { if (!autograd_ptr->StopGradient()) { VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name(); - autograd_ptr->SetGradNode(std::make_shared()); + autograd_ptr->SetGradNode( + std::make_shared(autograd_ptr)); return autograd_ptr->GetMutableGradNode(); } else { return nullptr; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 00013faa345e213a125a2fe6c70eef1e0b9160ef..fa5735e6f32a0ca7762b9ba94cce26ac8ac567dd 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -77,7 +77,7 @@ class PassStopGradientIter : public IterHelper { VLOG(2) << "Tensor is NULL"; return; } - element->SetStopGradient(stop_gradient_); + element->WeakSetStopGradient(stop_gradient_); } bool stop_gradient_ = true; @@ -102,6 +102,8 @@ class EagerUtils { static std::shared_ptr grad_node( const paddle::experimental::Tensor& target); + static paddle::experimental::Tensor* mutable_grad( + const paddle::experimental::Tensor& target); // Set history is used to set backward info during forward process, it will // set forward var's autograd meta's grad node as current backward node. @@ -173,17 +175,24 @@ class EagerUtils { const std::vector>& outs); static paddle::experimental::Tensor GetOutput( const std::shared_ptr& out); - // Sync Back to origin output Tensor - static void OverwriteOutputs(const std::shared_ptr& out, - paddle::experimental::Tensor* tensor); - static void OverwriteOutputs(const paddle::experimental::Tensor& out, - paddle::experimental::Tensor* tensor); - static void OverwriteOutputs( + static void GetOutput(const std::shared_ptr& out, + paddle::experimental::Tensor* out_var); + static void GetOutputs( const std::vector>& outs, - const std::vector& tensors); - static void OverwriteOutputs( - const std::vector& outs, - const std::vector& tensors); + std::vector* result); + static void GetOutputs( + const std::vector>& outs, + const std::vector& out_var); + static void GetOutputs(const std::shared_ptr& out, + std::vector* result); + static void GetOutputs( + const std::shared_ptr& out, + const std::vector& out_var); + + static void Output2Result( + const std::vector& out_var, + std::vector* result); + // end Intermidate needed static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 78f5bb077aaf189ff0d21aba853d62aebe46f53e..14aecb5fd43c49ece1f79cb9c8e2b70e9d07df07 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -193,19 +193,19 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) IF(WITH_XPU) -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info xpu_op_list) +cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list) ELSE() -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor 
selected_rows_utils place pten var_type_traits pten_api_utils op_info) +cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info) ENDIF() IF(WITH_XPU) cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils op_utils) + phi phi_utils kernel_factory infershape_utils op_utils) ELSE() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils op_utils) + phi phi_utils kernel_factory infershape_utils op_utils) ENDIF() cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -412,7 +412,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) cc_library(generator SRCS generator.cc DEPS enforce place) -cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) +cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place phi var_type_traits phi phi_api_utils op_info shape_inference) cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor) # Get the current working branch @@ -436,9 +436,8 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) -cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS - tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api) +cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) @@ -451,7 +450,7 @@ if(WITH_TESTING AND TEST selected_rows_utils_test) endif() cc_test(scope_guard_test SRCS scope_guard_test.cc) -cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils) +cc_test(phi_utils_test SRCS phi_utils_test.cc DEPS phi_utils) if(WITH_GPU OR WITH_ROCM) cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) @@ -459,4 +458,3 @@ else() cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) endif() cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) -cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 
81b6917587df9282d3ff59180e6fc079379cef60..ae3d8379bdbf779e2cf82d27c18997f82cb92095 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { diff --git a/paddle/fluid/framework/convert_utils.cc b/paddle/fluid/framework/convert_utils.cc index 23cf4324086bd48f7a2a429bde26f7303e8d34b3..df5cc6d82042c262467b35f6a7cbe097a4ad7776 100644 --- a/paddle/fluid/framework/convert_utils.cc +++ b/paddle/fluid/framework/convert_utils.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { -paddle::experimental::DataType TransToPtenDataType( +paddle::experimental::DataType TransToPhiDataType( const paddle::framework::proto::VarType::Type& dtype) { // Set the order of case branches according to the frequency with // the data type is used diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index c94b5b2311c5202832e5fe00c702e14c56ada9b9..da2af86c77c477c3c70b220b47bc073b47645a5d 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -32,7 +32,7 @@ namespace framework { using DataType = paddle::experimental::DataType; using DataLayout = paddle::experimental::DataLayout; -DataType TransToPtenDataType( +DataType TransToPhiDataType( const paddle::framework::proto::VarType::Type& dtype); paddle::framework::proto::VarType::Type TransToProtoVarType( diff --git a/paddle/fluid/framework/convert_utils_test.cc b/paddle/fluid/framework/convert_utils_test.cc index 51b431f4b4a8a080f312f7d8bfdf12c1cdc44e4b..140806dfd7c5e1ae2746f3d116f418fea16fa1f3 100644 --- a/paddle/fluid/framework/convert_utils_test.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -43,35 +43,35 @@ TEST(ConvertUtils, DataType) { CHECK(paddle::framework::TransToProtoVarType(paddle::DataType::FLOAT16) == paddle::framework::proto::VarType::FP16); // proto -> enum - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP64) == paddle::DataType::FLOAT64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP32) == paddle::DataType::FLOAT32); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT64) == paddle::DataType::INT64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT32) == paddle::DataType::INT32); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT8) == paddle::DataType::INT8); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::UINT8) == paddle::DataType::UINT8); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT16) == paddle::DataType::INT16); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( 
paddle::framework::proto::VarType::COMPLEX64) == paddle::DataType::COMPLEX64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::COMPLEX128) == paddle::DataType::COMPLEX128); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP16) == paddle::DataType::FLOAT16); } diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc index 3a00d9424646a5d7caae251edc55c62e5d024105..49a1e0774a6b1a7a1afd154029850ceb52040759 100644 --- a/paddle/fluid/framework/custom_kernel.cc +++ b/paddle/fluid/framework/custom_kernel.cc @@ -18,355 +18,24 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/custom_kernel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_kernel_info_helper.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/kernel_context.h" -#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/custom_kernel.h" namespace paddle { - namespace framework { -// set phi::Kernel args_def_ from op_kernel_info -// because we can not set directly to phi::Kernel without exposing -// phi::KernelArgsDef when parsing custom user function -static void ParseArgs(const OpKernelInfo& op_kernel_info, - phi::KernelArgsDef* args_def) { - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - for (auto& input : input_defs) { - auto type_index = - input.is_vector - ? std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendInput(input.backend, input.layout, input.dtype, type_index); - } - for (auto& output : output_defs) { - auto type_index = - output.is_vector - ? 
std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendOutput(output.backend, output.layout, output.dtype, - type_index); - } - for (auto& attr : attribute_defs) { - args_def->AppendAttribute(attr.type_index); - } -} - -// custom pten kernel call function define -static void RunKernelFunc(phi::KernelContext* ctx, - const OpKernelInfo& op_kernel_info) { - VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin..."; - - // input and output size is not params' num - // but actual Tensors' size - size_t input_size = ctx->InputsSize(); - size_t output_size = ctx->OutputsSize(); - size_t attr_size = ctx->AttrsSize(); - - // parameters' num of unified user kernel function - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - PADDLE_ENFORCE_GE(input_size, input_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx inputs size (%d) must be larger than " - "the size of kernel input_defs (%d).", - input_size, input_defs.size())); - - PADDLE_ENFORCE_GE(output_size, output_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx outputs size (%d) must be larger than " - "the size of kernel output_defs (%d).", - output_size, output_defs.size())); - - PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx attribute size (%d) must be equal to " - "to the size of kernel attribute_defs (%d).", - attr_size, attribute_defs.size())); - - VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size() - << "[tensor size:" << input_size << "]" - << " Attribute num: " << attribute_defs.size() - << " Output num: " << output_defs.size() - << "[tensor size:" << output_size << "]."; - - // Inputs mapping - std::vector custom_ins; - std::vector> custom_vec_ins; - for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) { - VLOG(3) << "Mapping Input[" << in_idx << "]"; - const std::pair range = ctx->InputRangeAt(in_idx); - - // is_vector tells if this Input is Tensor or std::vector - if (!input_defs.at(in_idx).is_vector) { - paddle::experimental::Tensor custom_t; - auto& ctx_tensor = ctx->InputAt(range.first); - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_ins.emplace_back(custom_t); - } else { - std::vector custom_vec_in; - auto ctx_tensor_vec = - ctx->MoveInputsBetween(range.first, range.second); - for (auto& ctx_tensor : ctx_tensor_vec) { - paddle::experimental::Tensor custom_t; - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_vec_in.emplace_back(custom_t); - } - custom_vec_ins.emplace_back(custom_vec_in); - } - VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // Attributes mapping - std::vector custom_attrs; - for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) { - VLOG(3) << "Mapping Attribute[" << attr_idx << "]"; - if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) { - bool arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int))) { - int arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(float))) { - float arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if 
(attribute_defs[attr_idx].type_index == - std::type_index(typeid(double))) { - double arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int64_t))) { - int64_t arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(phi::dtype::float16))) { - phi::dtype::float16 arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(DataType))) { - DataType arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const Scalar&))) { - const Scalar& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const ScalarArray&))) { - const ScalarArray& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported attribute attribute_defs[%d].type_index", attr_idx)); - } - VLOG(3) << "Mapped Attribute[" << attr_idx << "]"; - } - - // Outputs mapping - std::vector custom_outs; - std::vector> custom_vec_outs; - std::vector> custom_outs_ptr; - std::vector>> - custom_vec_outs_ptr; - - for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) { - VLOG(3) << "Mapping Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_outs.emplace_back(custom_t); - custom_outs_ptr.emplace_back(custom_t_ptr); - } else { - std::vector custom_vec_out; - std::vector> custom_vec_out_ptr; - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - for (auto ctx_tensor : ctx_tensor_vec) { - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_vec_out.emplace_back(custom_t); - custom_vec_out_ptr.emplace_back(custom_t_ptr); - } - custom_vec_outs.emplace_back(custom_vec_out); - custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // DeviceContext - // In pten, the first paramter XXContext is decided when registering - // through template param, but custom kernel function use unified - // DeviceContext as first parameter of user_kernel_fn, we use backend - // from OpKernelInfo to decide XXContext. In temporary simple - // DeviceContext, we just set necessary info to dev_ctx(such as stream - // in NPUContext), more related work should be done when - // phi::DeviceContext is exposed to outer. 
- DeviceContext dev_ctx; - auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info); - if (backend == phi::Backend::CPU) { - // do nothing - } else { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - size_t device_type_id_ = static_cast(backend) - - static_cast(phi::Backend::ALL_BACKEND); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); - if (!device_type.empty()) { - auto custom_ctx = - ctx->GetDeviceContext(); - dev_ctx.set_stream(custom_ctx.stream()); - return; - } -#endif - LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend - << " with compiled Paddle."; - return; - } - - auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info); - // call user function - user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs, - &custom_outs, &custom_vec_outs); - - VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function."; - - // NOTE: Map back the output tensors with stored shared_ptrs. - for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) { - VLOG(3) << "Mapping Back Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - *ctx_tensor = *(custom_outs_ptr.back().get()); - custom_outs_ptr.pop_back(); - } else { - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - auto custom_vec_ptr_out = custom_vec_outs_ptr.back(); - for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) { - *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get()); - custom_vec_ptr_out.pop_back(); - } - custom_vec_outs_ptr.pop_back(); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << "]."; - } - - // delete newed paddle::Tensor for outputs while calling user kernel function - for (size_t i = 0; i < custom_outs.size(); ++i) { - delete custom_outs[i]; - } - for (size_t i = 0; i < custom_vec_outs.size(); ++i) { - for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) { - delete custom_vec_outs[i][j]; - } - } -} - -void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos) { - for (size_t i = 0; i < op_kernel_infos.size(); ++i) { - auto& kernel_info = op_kernel_infos[i]; - auto op_type = OpKernelInfoHelper::GetOpName(kernel_info); - auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info); - - VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key; - - // 1.Check whether this kernel is valid for a specific operator - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true, - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] %s is not ready for custom kernel registering.", - op_type)); - - // 2.Check whether kernel_key has been already registed - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().kernels()[op_type].find(kernel_key), - phi::KernelFactory::Instance().kernels()[op_type].end(), - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been " - "already existed in Paddle, please contribute PR if need " - "to optimize the kernel code. 
Custom kernel do NOT support " - "to replace existing kernel in Paddle.", - op_type, kernel_key)); - - // phi::KernelFn - phi::KernelFn kernel_fn = [kernel_info](phi::KernelContext* ctx) { - VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda."; - RunKernelFunc(ctx, kernel_info); - }; - // variadic_kernel_fn - void* variadic_kernel_fn = - OpKernelInfoHelper::GetVariadicKernelFn(kernel_info); - phi::Kernel kernel(kernel_fn, variadic_kernel_fn); - // args info - ParseArgs(kernel_info, kernel.mutable_args_def()); - // register custom kernel to phi::KernelFactory - phi::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel; - VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type - << ">'s kernel " << kernel_key << " to Paddle. " - << "It will be used like native ones."; - } -} - -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map) { - auto& kernel_info_map = op_kernel_info_map.GetMap(); - VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: " - << kernel_info_map.size(); - - // pair: {op_type, OpKernelInfo} - for (auto& pair : kernel_info_map) { - VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first; - RegisterKernelWithMetaInfo(pair.second); - } -} - void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { #ifdef _LINUX - typedef OpKernelInfoMap& get_op_kernel_info_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetOpKernelInfoMap")); + typedef phi::CustomKernelMap& get_custom_kernel_map_t(); + auto* func = reinterpret_cast( + dlsym(dso_handle, "PD_GetCustomKernelMap")); if (func == nullptr) { LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetOpKernelInfoMap symbol in this lib."; + << "PD_GetCustomKernelMap symbol in this lib."; return; } - auto& op_kernel_info_map = func(); - RegisterKernelWithMetaInfoMap(op_kernel_info_map); + auto& custom_kernel_map = func(); + phi::RegisterCustomKernels(custom_kernel_map); LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; #else VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h index 30bccc97000f8847ddcf7ebddb4eabd6a6992afe..31084a34413ea4324c69062303ef84621a463aaf 100644 --- a/paddle/fluid/framework/custom_kernel.h +++ b/paddle/fluid/framework/custom_kernel.h @@ -14,22 +14,13 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/op_kernel_info.h" +#include namespace paddle { namespace framework { +// Load custom kernel lib and register void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); -// Load custom kernel api: register kernel after user compiled -void LoadOpKernelInfoAndRegister(const std::string& dso_name); - -// Register custom kernel api: register kernel directly -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map); - -// Interface for selective register custom kernel. -void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 597265bb2473fd14108b4fa11e7ae93957c4268b..b9e3bee25f6b5377dde7b525138643964fd8366a 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" @@ -779,13 +779,13 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, for (size_t i = 0; i < ctx->InputSize(in_name); ++i) { auto dtype = ctx->GetInputDataType(in_name, i); vec_custom_dtype.emplace_back( - paddle::framework::TransToPtenDataType(dtype)); + paddle::framework::TransToPhiDataType(dtype)); } vec_input_dtypes.emplace_back(vec_custom_dtype); } else { auto dtype = ctx->GetInputDataType(in_name); input_dtypes.emplace_back( - paddle::framework::TransToPtenDataType(dtype)); + paddle::framework::TransToPhiDataType(dtype)); } } diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index cf9e3de6c1a58a277e4508442c39a882ffa506b2..4757eb60f4361cffd9354afd4a8bf4bf99e86eb3 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 7152004b63de6deab22988a79917b536a0623c81..15cf30c1cf352324b57b8ca7bfcdf9d2d2640aea 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -28,7 +28,7 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::TransToPtenDataType(dtype)); + tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); // test fp16 tensor EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()), @@ -51,7 +51,7 @@ TEST(DataType, bfloat16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::TransToPtenDataType(dtype)); + tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); // test bf16 tensor EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()), diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 1cf69a1a3d652a49226447c5559613378bd3ee17..1b2b24762894c0d72e75f0c4d20531e21f146cfd 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -231,6 +231,8 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); this->pool_->enqueue([=] { std::deque op_queue; op_queue.push_front(op); diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index a6232667193438d9ca2346cd573d60e5dc5f802a..44b9ca90fc540b39d5b3ae53f3ddcee2c8d74d6f 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions 
and // limitations under the License. #include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DECLARE_bool(sync_nccl_allreduce); @@ -47,6 +48,8 @@ GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( #endif void GradMergeAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); PADDLE_ENFORCE_GT(local_scopes_.size(), 0, platform::errors::PreconditionNotMet( "The number of local scope should be > 0, but got %zu.", @@ -96,6 +99,8 @@ FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( #endif void FusedGradMergeAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); PADDLE_ENFORCE_GT(local_scopes_.size(), 0, platform::errors::PreconditionNotMet( "The number of local scope should be > 0, but got %zu.", diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 47ab1e0fc030a6897162a99e8eb4da5e34541c79..06019372a7323b3c61c067638da19b847eba9031 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 189724a5455200bdfbd0497aee53bc949df412e9..17346f5fd939324e6c2d709fb09be2cb65669429 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,6 +10,8 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h new file mode 100644 index 0000000000000000000000000000000000000000..a6508bf96c00f835da4aee79503f16fa5451e794 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "heter_comm.h" +#include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +class GpuPsGraphTable : public HeterComm { + public: + GpuPsGraphTable(std::shared_ptr resource) + : HeterComm(1, resource) { + load_factor_ = 0.25; + } + void build_graph_from_cpu(std::vector &cpu_node_list); + NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); + NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key, + int sample_size, int len); + NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); + void clear_graph_info(); + void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + 
int sample_size, int *h_left,
+ int *h_right,
+ int64_t *src_sample_res,
+ int *actual_sample_size);
+
+ private:
+ std::vector<GpuPsCommGraph> gpu_graph_list;
+};
+}
+};
+#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h"
+#endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..839c7e5468c6c6938c6b4cda3dd879c7366e7d6e
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
@@ -0,0 +1,447 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef PADDLE_WITH_HETERPS
+namespace paddle {
+namespace framework {
+/*
+comment 0
+this kernel just serves as an example of how to sample nodes' neighbors.
+feel free to modify it
+index[0,len) saves the nodes' index
+actual_size[0,len) is to save the sample size of each node.
+for ith node in index, actual_size[i] = min(node i's neighbor size, sample size)
+sample_result is to save the neighbor sampling result, its size is len *
+sample_size;
+
+*/
+
+__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index,
+ int* actual_size,
+ int64_t* sample_result, int sample_size,
+ int len) {
+ const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < len) {
+ auto node_index = index[i];
+ actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size
+ ? graph.node_list[node_index].neighbor_size
+ : sample_size;
+ int offset = graph.node_list[node_index].neighbor_offset;
+ for (int j = 0; j < actual_size[i]; j++) {
+ sample_result[sample_size * i + j] = graph.neighbor_list[offset + j];
+ }
+ }
+}
+
+/*
+ comment 1
+
+ gpu i triggers a neighbor_sample task,
+ when this task is done,
+ this function is called to move the sample result on other gpu back
+ to gpu i and aggregate the result.
+ the sample_result is saved on src_sample_res and the actual sample size for
+ each node is saved on actual_sample_size.
+ the number of actual sample_result for
+ key[x] (refer to comment 2 for definition of key)
+ is saved on actual_sample_size[x]; since the neighbor size of key[x] might be
+ smaller than sample_size,
+ the sample result of key[x] is saved on src_sample_res [x*sample_size, x*sample_size +
+ actual_sample_size[x])
+
+ since before each gpu runs the neighbor_sample task, the key array is shuffled,
+ but we have the idx array to save the original order.
+ when the gpu i gets all the sample results from other gpus, it relies on
+ idx array to recover the original order.
+ that's what fill_dvalues does.
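+
+ a small illustrative walk-through (example values only, not taken from the
+ code): suppose gpu_num = 2, sample_size = 2 and the caller passes
+ key = [5,2,8]. keys are sharded by key % gpu_num, so the shuffled order is
+ [2,8,5], idx = [1,2,0] (idx[i] = original position of shuffled key i),
+ h_left = [0,2] and h_right = [1,2].
+ after this function has copied the per-gpu results back into d_shard_vals and
+ d_shard_actual_sample_size, fill_dvalues writes
+ d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i] and copies row i
+ of the shuffled sample matrix to row idx[i], so the caller sees the results
+ in the original [5,2,8] order.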
+ +*/ +void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( + int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, + int64_t* src_sample_res, int* actual_sample_size) { + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto shard_len = h_right[i] - h_left[i] + 1; + // int cur_step = path_[gpu_id][i].nodes_.size() - 1; + // auto& node = path_[gpu_id][i].nodes_[cur_step]; + auto& node = path_[gpu_id][i].nodes_.front(); + cudaMemcpyAsync( + reinterpret_cast(src_sample_res + h_left[i] * sample_size), + node.val_storage + sizeof(int64_t) * shard_len, + node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + node.out_stream); + cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len, + sizeof(int) * shard_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } +} + +/* +TODO: +how to optimize it to eliminate the for loop +*/ +__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, + int* d_shard_actual_sample_size, + int* d_actual_sample_size, int* idx, + int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; + // d_vals[idx[i]] = d_shard_vals[i]; + for (int j = 0; j < sample_size; j++) { + d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } + } +} + +__global__ void node_query_example(GpuPsCommGraph graph, int start, int size, + int64_t* res) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + res[i] = graph.node_list[start + i].node_id; + } +} + +void GpuPsGraphTable::clear_graph_info() { + if (tables_.size()) { + for (auto table : tables_) delete table; + } + tables_.clear(); + for (auto graph : gpu_graph_list) { + if (graph.neighbor_list != NULL) { + cudaFree(graph.neighbor_list); + } + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } + } + gpu_graph_list.clear(); +} +/* +the parameter std::vector cpu_graph_list is generated by cpu. +it saves the graph to be saved on each gpu. 
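+ a minimal usage sketch (illustrative only, mirroring what test_graph.cu does):
+   std::vector<GpuPsCommGraph> cpu_graph_list(gpu_num);
+   // fill cpu_graph_list[i].node_list / neighbor_list for every shard i
+   GpuPsGraphTable table(resource);  // resource: shared_ptr to the HeterPs resource
+   table.build_graph_from_cpu(cpu_graph_list);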
+ +for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number +== i + +In this function, memory is allocated on each gpu to save the graphs, +gpu i saves the ith graph from cpu_graph_list +*/ + +void GpuPsGraphTable::build_graph_from_cpu( + std::vector& cpu_graph_list) { + PADDLE_ENFORCE_EQ( + cpu_graph_list.size(), resource_->total_gpu(), + platform::errors::InvalidArgument("the cpu node list size doesn't match " + "the number of gpu on your machine.")); + clear_graph_info(); + for (int i = 0; i < cpu_graph_list.size(); i++) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + gpu_graph_list.push_back(GpuPsCommGraph()); + auto table = + new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); + tables_.push_back(table); + if (cpu_graph_list[i].node_size > 0) { + std::vector keys; + std::vector offset; + cudaMalloc((void**)&gpu_graph_list[i].node_list, + cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, + cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), + cudaMemcpyHostToDevice); + for (int j = 0; j < cpu_graph_list[i].node_size; j++) { + keys.push_back(cpu_graph_list[i].node_list[j].node_id); + offset.push_back(j); + } + build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; + } else { + gpu_graph_list[i].node_list = NULL; + gpu_graph_list[i].node_size = 0; + } + if (cpu_graph_list[i].neighbor_size) { + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(int64_t), + cudaMemcpyHostToDevice); + gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size; + } else { + gpu_graph_list[i].neighbor_list = NULL; + gpu_graph_list[i].neighbor_size = 0; + } + } + cudaDeviceSynchronize(); +} +NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, + int64_t* key, + int sample_size, + int len) { + /* + comment 2 + this function shares some kernels with heter_comm_inl.h + arguments definitions: + gpu_id:the id of gpu. + len:how many keys are used,(the length of array key) + sample_size:how many neighbors should be sampled for each node in key. + + the code below shuffle the key array to make the keys + that belong to a gpu-card stay together, + the shuffled result is saved on d_shard_keys, + if ith element in d_shard_keys_ptr is + from jth element in the original key array, then idx[i] = j, + idx could be used to recover the original array. 
+ if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] = + b, + if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1 + + for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2 + when we run this neighbor_sample function, + the key is shuffled to [0,2,4,6,8,1,3,5,7] + the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0, + the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1, + h_left = [0,5],h_right = [4,8] + + */ + NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); + if (len == 0) { + return result; + } + cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); + cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + int* actual_sample_size = result->actual_sample_size; + int64_t* val = result->val; + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(gpu_id); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_id, 0); + + int grid_size = (len - 1) / block_size_ + 1; + + int h_left[total_gpu]; // NOLINT + int h_right[total_gpu]; // NOLINT + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); + int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast(d_shard_actual_sample_size->ptr()); + + split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); + + fill_shard_key<<>>(d_shard_keys_ptr, key, + d_idx_ptr, len); + + cudaStreamSynchronize(stream); + + cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost); + cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + /* + comment 3 + shard_len denotes the size of keys on i-th gpu here, + when we sample on i-th gpu, we allocate shard_len * (1 + sample_size) + int64_t units + of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved + for the respective nodes' indexes + and acutal sample_size. + with nodes' indexes we could get the nodes to sample. + since size of int64_t is 8 bits, while size of int is 4, + the range of [0,shard_len) contains shard_len * 2 int uinits; + The values of the first half of this range will be updated by + the k-v map on i-th-gpu. + The second half of this range is saved for actual sample size of each node. 
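+ put differently (an illustrative view of the same layout, in int64_t units):
+ alloc_mem_i = [ node indexes + actual sizes : shard_len units |
+                 samples of node 0 : sample_size units | ... |
+                 samples of node shard_len-1 : sample_size units ]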
+ For node x, + its sampling result is saved on the range + [shard_len + sample_size * x,shard_len + sample_size * x + + actual_sample_size_of_x) + of alloc_mem_i, actual_sample_size_of_x equals ((int + *)alloc_mem_i)[shard_len + x] + */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), + shard_len * (1 + sample_size) * sizeof(int64_t)); + } + walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // auto& node = path_[gpu_id][i].nodes_.back(); + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // use the key-value map to update alloc_mem_i[0,shard_len) + tables_[i]->rwlock_->RDLock(); + tables_[i]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + } + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // cudaStreamSynchronize(resource_->remote_stream(i, num)); + // tables_[i]->rwlock_->UNLock(); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + auto& node = path_[gpu_id][i].nodes_.front(); + auto shard_len = h_right[i] - h_left[i] + 1; + auto graph = gpu_graph_list[i]; + int* res_array = reinterpret_cast(node.val_storage); + int* actual_size_array = res_array + shard_len; + int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); + neighbor_sample_example<<remote_stream(i, gpu_id)>>>( + graph, res_array, actual_size_array, sample_array, sample_size, + shard_len); + } + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); + tables_[i]->rwlock_->UNLock(); + } + // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, + h_left, h_right, d_shard_vals_ptr, + d_shard_actual_sample_size_ptr); + + fill_dvalues<<>>( + d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, + d_idx_ptr, sample_size, len); + cudaStreamSynchronize(stream); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + return result; +} + +NodeQueryResult* GpuPsGraphTable::graph_node_sample(int gpu_id, + int sample_size) {} + +NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, + int query_size) { + NodeQueryResult* result = new NodeQueryResult(); + if (query_size <= 0) return result; + int& actual_size = result->actual_sample_size; + actual_size = 0; + cudaMalloc((void**)&result->val, query_size * sizeof(int64_t)); + int64_t* val = result->val; + int dev_id = resource_->dev_id(gpu_id); + platform::CUDADeviceGuard guard(dev_id); + std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; + int size = 0; + /* + if idx[i] = a, gpu_begin_pos[i] = p1, + gpu_local_begin_pos[i] = p2; + sample_size[i] = s; + then on gpu a, the nodes of positions [p1,p1 + s) should be returned + and saved from the p2 position on the sample_result array + + for example: + suppose + gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] + start = 3, query_size = 5 + we know [6,8,1,3,5] should be returned; + idx = [0,1] + gpu_begin_pos = [3,0] + local_begin_pos = [0,3] + sample_size = [2,3] + + */ + for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { + auto graph = gpu_graph_list[i]; + if (graph.node_size == 0) { + continue; + } + if (graph.node_size + size > start) { + int cur_size = min(query_size, graph.node_size + size - start); + query_size -= cur_size; + idx.emplace_back(i); + gpu_begin_pos.emplace_back(start - size); + local_begin_pos.emplace_back(actual_size); + start += cur_size; + actual_size += cur_size; + sample_size.emplace_back(cur_size); + create_storage(gpu_id, i, 1, cur_size * sizeof(int64_t)); + } + size += graph.node_size; + } + for (int i = 0; i < idx.size(); i++) { + int dev_id_i = resource_->dev_id(idx[i]); + platform::CUDADeviceGuard guard(dev_id_i); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + int grid_size = (sample_size[i] - 1) / block_size_ + 1; + node_query_example<<remote_stream(idx[i], gpu_id)>>>( + gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i], + (int64_t*)node.val_storage); + } + + for (int i = 0; i < idx.size(); i++) { + cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast(val + local_begin_pos[i]), + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < idx.size(); i++) { + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } + return result; +} +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 7b43e68ff0151e03d426e11ed8266d25a125140a..1fca8cdf8bb801a57ec36ee957b27236f488a4b3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -173,16 +173,18 @@ class HeterComm { void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); - private: + protected: using Table = HashTable; - int block_size_{256}; - float load_factor_{0.75}; std::vector tables_; std::shared_ptr resource_; - CustomGradMerger merger_; - int topo_aware_{0}; std::vector> path_; + float load_factor_{0.75}; + int block_size_{256}; + + private: std::vector storage_; + CustomGradMerger merger_; + int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; std::vector nccl_inner_comms_; diff --git 
a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu new file mode 100644 index 0000000000000000000000000000000000000000..697e0ba2cdf3475d1e7ad48105bc55959461900f --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +TEST(TEST_FLEET, graph_comm) { + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + graph_list[ind % gpu_count].node_size++; + graph_list[ind % gpu_count].neighbor_size += neighbor_size; + while (neighbor_size--) { + neighbors[ind].push_back(node_id++); + } + ind++; + } + std::vector neighbor_offset(gpu_count, 0), node_index(gpu_count, 0); + for (int i = 0; i < graph_list.size(); i++) { + graph_list[i].node_list = new GpuPsGraphNode[graph_list[i].node_size]; + graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size]; + } + for (int i = 0; i < node_count; i++) { + ind = i % gpu_count; + graph_list[ind].node_list[node_index[ind]].node_id = i; + graph_list[ind].node_list[node_index[ind]].neighbor_offset = + neighbor_offset[ind]; + graph_list[ind].node_list[node_index[ind]].neighbor_size = + neighbors[i].size(); + for (auto x : neighbors[i]) { + graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x; + } + node_index[ind]++; + } + g.build_graph_from_cpu(graph_list); + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + int64_t answer[6] = {6, 9, 1, 4, 7, 2}; + int64_t *res = new int64_t[6]; + auto query_res = g.query_node_list(0, 2, 6); + cudaMemcpy(res, query_res->val, 48, cudaMemcpyDeviceToHost); + ASSERT_EQ(query_res->actual_sample_size, 6); + for (int i = 0; i < 6; i++) { + ASSERT_EQ(res[i], answer[i]); + } + delete[] res; + delete query_res; + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void 
**)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 8c3c1e015262b70efb575b0d3a5ebcd662459170..84dcdad78298acbd74b2f2d23e81ceba4bd71a72 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -161,7 +161,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->set_lod(lod); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(ToVarType(req_var.data_type()))); + place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), @@ -202,7 +202,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->set_lod(lod); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(ToVarType(req_var.data_type()))); + place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); #ifdef PADDLE_WITH_XPU memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 8aafd3459ed1a3d1673e482016c550e69c74a6cd..b6759bb2e6fe6c5a3688f3d72e84aabf0c1d2717 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -38,7 +38,7 @@ void SetMicroId(paddle::framework::Scope* scope, std::vector dims{1}; tensor->Resize(phi::make_ddim(dims)); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(framework::proto::VarType::FP32)); + place, framework::TransToPhiDataType(framework::proto::VarType::FP32)); if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA std::vector temp; diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index aae36cf455dfee028b18050bdf431ee4601c479e..e14b91d935d05c12442f3d0205c1e97df9697ec3 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -18,8 +18,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -143,7 +144,7 @@ class CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return paddle::framework::TransToPtenDataType(var->GetDataType()); + return paddle::framework::TransToPhiDataType(var->GetDataType()); } } @@ -340,24 +341,37 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } if (infershape_inputs.size() != 1) { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarArrayFromVarList(vars))); + std::move(experimental::MakePhiScalarArrayFromVarList(vars))); } else { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarArrayFromVar(*vars[0]))); + std::move(experimental::MakePhiScalarArrayFromVar(*vars[0]))); } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 1; + int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); for (size_t i = 0; i < infershape_inputs.size(); i++) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); + + if (vars.size() == 1) { + num_ele = 1; + const auto& tensor_dims = vars[0]->GetShape(); for (size_t i = 0; i < tensor_dims.size(); ++i) { num_ele *= tensor_dims[i]; } + } else { + for (auto& var : vars) { + const auto& tensor_dims = var->GetShape(); + PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, + platform::errors::InvalidArgument( + "The shape is constructed by multi-tensor, " + "every tensor's dims should be 1. 
But your " + "shape has tensor that dims is %s.", + tensor_dims.size())); + num_ele += tensor_dims[0]; + } } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -376,47 +390,101 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name)); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(phi::Scalar))) { + if (ctx->HasAttr(attr_name)) { + // TODO(chentianyu03): support other attrs later + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == std::type_index(typeid(float))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(float, attr))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::string))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(std::string, attr))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(int, attr))); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "InferMetaContext.", + attr_name)); + } + } else if (ctx->HasInput(attr_name)) { + const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); + if (infershape_input.size() == 1) { + if (ctx->IsRuntime()) { + Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiScalarFromVar(*var))); + } else { + phi::Scalar tensor_scalar(-1); + tensor_scalar.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input.size() when cast op attribute `%s` to Scalar, " + "expected 1, but actually is %d .", + attr_name, infershape_input.size())); + } + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. 
auto& attr = attr_reader.GetAttr(attr_name); - if (std::type_index(attr.type()) == std::type_index(typeid(bool))) { + if (attr_defs[i].type_index == std::type_index(typeid(bool))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(int))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int64_t))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(float))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + // Emplace Back Attr according to the type of Phi_Kernel args. 
+ const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr(vector_int_attr.begin(), + vector_int_attr.end()); + infer_meta_context.EmplaceBackAttr(vector_int64_attr); + } else { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(phi::DataType))) { + auto data_type = paddle::framework::TransToPhiDataType( + static_cast( + BOOST_GET_CONST(int, attr))); + infer_meta_context.EmplaceBackAttr(data_type); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported attribute type is received when call " diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 592e787109d18c45eb872fb720954ed29b073ea4..53dcc19fcbae88ab5ccfcc498037327946029927 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -118,7 +118,7 @@ REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOpMaker, InferShapeUtilsTestInferShapeFunctor); -PT_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, +PD_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, paddle::framework::InferShapeUtilsTestKernel, int) {} TEST(InferShapeUtilsTest, ALL) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0e1e572a51f7fcbc84415bab3808dfaed97dfd08..dad5358590cb1497453681ce940898314a1d06eb 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -147,7 +147,7 @@ if(WITH_IPU) pass_library(ipu_runtime_replacer_pass base DIR ipu) pass_library(inference_process_pass base DIR ipu) pass_library(inference_postprocess_pass base DIR ipu) - pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu DEPS paddle_ipu) pass_library(ipu_inplace_pass base DIR ipu) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index ec5d48b3093f7c73bffa0196ccd75e11a89baeac..26ee02ff1812d2e73d0be3bed762d1a4ae4ac6c7 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -276,13 +276,13 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU( bool support_gpu = false; auto &kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = - kernel_factory.SelectKernelMap(phi::TransToPtenKernelName(op_type)); + kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); bool has_op_kernel = kernel_key_map.size() > 0 ? 
true : false; for (auto &kernel : kernel_key_map) { - if (platform::is_gpu_place(phi::TransToPtenPlace(kernel.first.backend()))) { + if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { support_gpu = true; } else if (platform::is_cpu_place( - phi::TransToPtenPlace(kernel.first.backend()))) { + phi::TransToPhiPlace(kernel.first.backend()))) { support_cpu = true; } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 63559e201594a659f698f812086bd6e8b8608827..e4c9dc72128f4850b2e0e4af739fdd381e4a3b1e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2516,6 +2516,15 @@ PDNode *patterns::DuplicatedInputs::operator()() { return op; } +PDNode *patterns::DuplicatedOutputs::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_ops({"split"}); + op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "bfloat16"; + }); + return op; +} + PDNode *patterns::MKLDNNInPlace::operator()() { const std::unordered_set &supported_op_types = { "abs", "gelu", "leaky_relu", "relu", "softmax", "sqrt", "swish", "tanh"}; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 79f1d63a1519018ecf3d3b18690746a35ab1dd95..d6400ed6945bf8a60c1d4f357bf58a11d5b87094 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1495,6 +1495,15 @@ struct DuplicatedInputs : public PatternBase { PATTERN_DECL_NODE(op); }; +struct DuplicatedOutputs : public PatternBase { + DuplicatedOutputs(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "many_outputs_op") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(op); +}; + // Pattern used for enforcing inplace computation for in-place computation // supporting DNNL ops. 
softmax, batch_norm and layer_norm struct MKLDNNInPlace : public PatternBase { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 3d8d353cbf530ebe9cc9ea90937b9acf5ddd4a0f..9fe50deaf2d72679bc5c41038936d01cad9de498 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -56,7 +56,7 @@ const bool is_regularization_op(const std::string& op_namescope) { } void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { - // the op constructed here follows the popart definition; some of the values involved need to be obtained in LowerOptimier + // optimizer values will be extracted when lowering optimizer in ipu_backend OpDesc new_op("popart_optimizer", {}, {}, {}); new_op.SetAttr("op_role", 0); new_op.SetAttr("with_lr_sched", false); @@ -86,7 +86,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { bool is_regularization = is_regularization_op(op_namescope); VLOG(10) << "found optimizer related op: " << op_type; - // initial learning_rate will be set in LowerOptimier + // initial learning_rate will be set in ipu_backend set_ops.insert(op_type); if (op_type == "sgd") { auto type = std::string{"sgd"}; diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index 975a4b62cc708859803a2137741caaf413e50210..6806e44f0950535b059e8e7186541ab90973e6ab 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" @@ -28,11 +29,8 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto custom_ops = Get>("custom_ops"); std::vector missing_ops; - auto nodes = graph->Nodes(); - for (auto* node : nodes) { - if (!node->IsOp()) { - continue; - } + auto sorted_ops = TopologySortOperations(*graph); + for (auto* node : sorted_ops) { auto* op = node->Op(); auto op_type = op->Type(); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index dafcc9c4e16a3ee43df17c1c0d650288c31b18b8..e9850483ebe913e298dc7501ed4155fb0dfc2879 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -96,7 +96,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void MainTest(bool convWithExistingBias) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index 5f9aefc1e7a0bd372a9155a25d3102ceaf9ee1e1..f1bd34a5ad4f6241585c0b00e9ab65b042388c39 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -52,7 +52,7 @@ bool IsPermittedOutputName(const std::string& output_name) { } void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, - int* quantize_counter) { 
std::vector input_names; // Find the name of the input linking op to op_in @@ -87,10 +87,10 @@ void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, IR_NODE_LINK_TO(op_in, quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_node); IR_NODE_LINK_TO(quantize_out_node, op); - (*quantize_counter)++; + quantize_counter++; } -void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { +void AddQuantizes(Graph* g, ir::Node* op, int& quantize_counter) { auto inputs = op->inputs; PADDLE_ENFORCE_GE(inputs.size(), 1, platform::errors::InvalidArgument( @@ -127,7 +127,7 @@ void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { IR_NODE_LINK_TO(inputs[i], quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); IR_NODE_LINK_TO(quantize_out_nodes[i], op); - (*quantize_counter)++; + quantize_counter++; } op->Op()->SetInput("X", quantize_out_node_names); @@ -136,7 +136,7 @@ void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { // Operators like Concat and Sum have a single input name X, which actually // consists of multiple inputs. Such operators require a different way to find // pattern and add quantize ops. -void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int* quantize_counter) { +void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int& quantize_counter) { GraphPatternDetector gpd; patterns::DuplicatedInputs duplicated_inputs{gpd.mutable_pattern(), "duplicated_inputs"}; @@ -151,7 +151,7 @@ void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int* quantize_counter) { // Adding quantize ops before all operators except Concat and Sum, which have // already been handled in AddReoderBeforeDuplicatedInputs -void AddReoderBeforeSingleInputs(ir::Graph* graph, int* quantize_counter) { +void AddReoderBeforeSingleInputs(ir::Graph* graph, int& quantize_counter) { GraphPatternDetector gpd; patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), "first_bfloat16_ops"}; @@ -169,60 +169,134 @@ void AddReoderBeforeSingleInputs(ir::Graph* graph, int* quantize_counter) { void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { int quantize_counter = 0; - AddReoderBeforeDuplicatedInputs(graph, &quantize_counter); - AddReoderBeforeSingleInputs(graph, &quantize_counter); + AddReoderBeforeDuplicatedInputs(graph, quantize_counter); + AddReoderBeforeSingleInputs(graph, quantize_counter); PrettyLogDetail("--- added %d quantize ops before bfloat16 op", quantize_counter); } -void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { +void AddDequantize(Graph* g, ir::Node* op, ir::Node* op_out, + int& dequantize_counter) { + if (op->Op()->Type() == "prior_box") return; + + // Find the name of the output linking op to op_out + std::vector output_names; + for (auto name : op->Op()->OutputNames()) + for (auto output_name : op->Op()->Output(name)) + if (output_name == op_out->Name() && IsPermittedOutputName(name)) + output_names.push_back(name); + + if (output_names.empty()) return; + + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({op_out->Name()})); + deq_desc.SetAttr("Scale", 1.0f); + deq_desc.SetAttr("Shift", 0.0f); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
+ + for (auto name = output_names.begin(); name < output_names.end(); name++) + op->Op()->SetOutput(*name, + std::vector({dequantize_in_node->Name()})); + + UnlinkNodes(op, op_out); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, op_out); + + dequantize_counter++; +} + +void AddDequantizes(Graph* g, ir::Node* op, int& dequantize_counter) { + auto outputs = op->outputs; + PADDLE_ENFORCE_GE(outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal or greater than 1.", + op->Name(), outputs.size())); + PADDLE_ENFORCE_EQ(op->inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal to 1.", op->Name(), + op->inputs.size())); + + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + + std::vector dequantize_in_nodes(outputs.size()); + std::vector dequantize_in_node_names(outputs.size()); + + for (size_t i = 0; i < outputs.size(); i++) { + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + dequantize_in_nodes[i] = g->CreateVarNode(&dequantize_in_desc); + dequantize_in_node_names[i] = dequantize_in_nodes[i]->Name(); + + deq_desc.SetInput("Input", + std::vector({dequantize_in_node_names[i]})); + deq_desc.SetOutput("Output", + std::vector({outputs[i]->Name()})); + + deq_desc.SetAttr("Scale", 1.f); + deq_desc.SetAttr("Shift", 0.0f); + deq_desc.SetAttr("bfloat16", true); + deq_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") + ? op->Op()->GetAttr("data_layout") + : std::string("NCHW")); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. + + UnlinkNodes(op, outputs[i]); + IR_NODE_LINK_TO(op, dequantize_in_nodes[i]); + IR_NODE_LINK_TO(dequantize_in_nodes[i], dequantize_op); + IR_NODE_LINK_TO(dequantize_op, outputs[i]); + + dequantize_counter++; + } + + op->Op()->SetOutput("Out", dequantize_in_node_names); +} + +// Operators like split have a single output name Out, which actually +// consists of multiple outputs. Such operators require a different way to find +// pattern and add dequantize ops. 
+void AddReoderAfterDuplicatedOutputs(ir::Graph* graph, + int& dequantize_counter) { + GraphPatternDetector gpd; + patterns::DuplicatedOutputs duplicated_outputs{gpd.mutable_pattern(), + "duplicated_outputs"}; + duplicated_outputs(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_outputs); + AddDequantizes(g, op, dequantize_counter); + }; + gpd(graph, handler); +} + +// Adding dequantize ops after all operators except split, which has +// already been handled in AddReoderAfterDuplicatedOutputs +void AddReoderAfterSingleOutputs(ir::Graph* graph, int& dequantize_counter) { GraphPatternDetector gpd; patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), "last_bfloat16_ops"}; bfloat16_ops(); - int dequantize_counter = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); - - if (op->Op()->Type() != "prior_box") { - // Find the name of the output linking op to op_out - std::vector output_names; - for (auto name : op->Op()->OutputNames()) - for (auto output_name : op->Op()->Output(name)) - if (output_name == op_out->Name() && IsPermittedOutputName(name)) - output_names.push_back(name); - - if (output_names.empty()) return; - - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - deq_desc.SetInput("Input", - std::vector({dequantize_in_node->Name()})); - deq_desc.SetOutput("Output", std::vector({op_out->Name()})); - deq_desc.SetAttr("Scale", 1.0f); - deq_desc.SetAttr("Shift", 0.0f); - auto dequantize_op = - g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
- - for (auto name = output_names.begin(); name < output_names.end(); name++) - op->Op()->SetOutput( - *name, std::vector({dequantize_in_node->Name()})); - - UnlinkNodes(op, op_out); - IR_NODE_LINK_TO(op, dequantize_in_node); - IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(dequantize_op, op_out); - - dequantize_counter++; + GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); + if (op->Op()->Type() != "split") { + AddDequantize(g, op, op_out, dequantize_counter); } }; gpd(graph, handler); +} + +void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { + int dequantize_counter = 0; + AddReoderAfterDuplicatedOutputs(graph, dequantize_counter); + AddReoderAfterSingleOutputs(graph, dequantize_counter); PrettyLogDetail("--- added %d dequantize ops after bfloat16 op", dequantize_counter); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc index f620b4c94fe8906ac957aff041137d73832315da..877ee71fc2d85dd6ac7bcf4c2e41cc92e3e2ef2d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc @@ -45,7 +45,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); op->SetAttr("mkldnn_data_type", mkldnn_data_type); - } else if (type == "concat" || type == "sum") { + } else if (type == "concat" || type == "sum" || type == "split") { op->SetInput("X", inputs); op->SetOutput("Out", outputs); op->SetAttr("mkldnn_data_type", mkldnn_data_type); @@ -117,6 +117,7 @@ TEST(CpuBfloat16Pass, convolution) { bool use_mkldnn = true; int quant_op = 3; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescConv(use_mkldnn), quant_op, dequant_op, added_nodes); } @@ -140,6 +141,7 @@ TEST(CpuBfloat16Pass, double_input_ops) { bool use_mkldnn = true; int quant_op = 4; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDoubleInput(use_mkldnn), quant_op, dequant_op, added_nodes); @@ -164,11 +166,35 @@ TEST(CpuBfloat16Pass, duplicated_input_ops) { bool use_mkldnn = true; int quant_op = 5; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDuplicatedInput(use_mkldnn), quant_op, dequant_op, added_nodes); } +ProgramDesc BuildProgramDescDuplicatedOutput(bool use_mkldnn) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, use_mkldnn, "float32"); + SetOp(&prog, "split", "Split", {"b"}, {"c", "d"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "transpose2", "Transpose", {"c"}, {"e"}, use_mkldnn, "float32"); + SetOp(&prog, "reshape2", "Reshape", {"d"}, {"f"}, use_mkldnn, "bfloat16"); + + return prog; +} + +TEST(CpuBfloat16Pass, duplicated_output_ops) { + bool use_mkldnn = true; + int quant_op = 2; + int dequant_op = 3; + // each added op consists of 2 nodes + int added_nodes = quant_op * 2 + dequant_op * 2; + MainTest(BuildProgramDescDuplicatedOutput(use_mkldnn), quant_op, dequant_op, + added_nodes); +} + ProgramDesc BuildProgramDescDoubleOutputs(bool use_mkldnn) { ProgramDesc prog; for (auto& v : variable_names) { @@ -190,6 +216,7 @@ TEST(CpuBfloat16Pass, double_outputs_ops) { bool use_mkldnn = true; int quant_op = 3; int 
dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDoubleOutputs(use_mkldnn), quant_op, dequant_op, added_nodes); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 3a78c229bd8fa83ff4c4d96ff270f20f131ab52b..889417b78c8641060b8ad89219749d8400558c6a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -126,7 +126,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index e00bb84e35c09eb987b2470c041545cf7f53e4ea..0506bfaf447ac68368d7d8f2a87014a6234c444c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -526,7 +526,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void PrepareGraph(std::unique_ptr* graph, const ProgramDesc& prog) { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index ea335e9bd63c624310df2f092b13e30a9458bb93..0a95444f852dd0abdd150d04dc7536e26151c218 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc index ca42a613411ba6078b00522d2c178856993fa462..d6761d2e82ef300264d9f2bd35b6441de2e00a67 100644 --- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -428,6 +428,19 @@ PrelnEmbeddingEltwiseLayerNormFusePass:: void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); + + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_embedding_eltwise_layernorm_fuse_pass need: use_trt, " + "enable_int8, " + "use_oss, with_interleaved, with_dynamic_shape. 
Stop this pass, " + "please reconfig."; + return; + } + int fusion_count = PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); if (fusion_count > 0) { diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc index 1b7b82cbca9e86587467fa0888eca6c6fdc2e162..978360d8f0a95b545b1460620d81eec8642977c2 100644 --- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct PrelnSkipLayerNorm : public PatternBase { void operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -62,8 +61,13 @@ void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) { auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) ->assert_is_op_output("elementwise_add") ->assert_is_op_input("layer_norm", "X") - ->assert_is_op_input("elementwise_add", "Y"); - + ->assert_more([](Node *x) { + if (x->outputs.size() == 2) { + return true; + } else { + return false; + } + }); // Add links for elementwise_add op. elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); @@ -104,6 +108,18 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_skip_layernorm_fuse", graph); + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_skip_layernorm_fuse_pass need: use_trt, enable_int8, " + "use_oss, " + "with_interleaved, with_dynamic_shape. Stop this pass, please " + "reconfig. "; + return; + } + int found_subgraph_count = 0; GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index db194d59d37bafc78cc7da50a664a6788a657a88..bfa14d9296b26e08f56e8ab2f30542524b786cf9 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct SkipLayerNorm : public PatternBase { PDNode *operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -59,9 +58,10 @@ PDNode *SkipLayerNorm::operator()(PDNode *x, PDNode *y) { y->assert_is_op_input("elementwise_add", "Y"); auto *elementwise = pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); - auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); // Add links for elementwise_add op. 
elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index a9e0b9c98b46f39b98a6bdce1fc12bbc3321ef00..56f9e6842373b3eba7d2d71b84adbf17ad291254 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -447,7 +447,7 @@ void MergeLoDTensor(LoDTensor *target, target->set_layout(new_layout); target->set_lod(new_lod); target->mutable_data(dst_place, - paddle::framework::TransToPtenDataType(new_type)); + paddle::framework::TransToPhiDataType(new_type)); int begin = 0; for (auto *src : lod_tensors) { diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index ddda7231887edfc78fa7b1b6adc5cd8324e5b894..006485a698fb3dc93188cd46450ea108e709ff6d 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -31,15 +31,17 @@ TEST(LoD, data) { lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); auto& v = lod[0]; + paddle::framework::MixVector mix_vector_v(&v); paddle::platform::CUDAPlace gpu(0); #ifdef PADDLE_WITH_HIP - hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu), - v.size()); + hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, + mix_vector_v.CUDAMutableData(gpu), v.size()); hipDeviceSynchronize(); #else - test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + test<<<1, 1>>>(mix_vector_v.CUDAMutableData(gpu), v.size()); cudaDeviceSynchronize(); #endif + mix_vector_v.CopyToCPU(); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -62,15 +64,17 @@ TEST(LoDTensor, LoDInGPU) { EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); auto lod = lod_tensor.lod(); + paddle::framework::MixVector mix_vector(&(lod[0])); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0, - lod[0].CUDAMutableData(place), lod[0].size()); + mix_vector.CUDAMutableData(place), lod[0].size()); hipDeviceSynchronize(); #else - test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + test<<<1, 8>>>(mix_vector.CUDAMutableData(place), lod[0].size()); cudaDeviceSynchronize(); #endif + mix_vector.CopyToCPU(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc index b15a66c51c4b6365cb4285894efb1e37a03b7b64..67b2d70f3440c5254abb5ff67995e6758af5c8f1 100644 --- a/paddle/fluid/framework/mixed_vector.cc +++ b/paddle/fluid/framework/mixed_vector.cc @@ -64,19 +64,20 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, auto stream = dev_ctx->stream(); paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst, platform::CPUPlace(), src, *gpu_memory_size_, stream); + dev_ctx->Wait(); #endif } -#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ - template <> \ - void Vector<__TYPE__>::VectorData::CopyToCPU() const { \ - CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_); \ - } \ - \ - template <> \ - void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ - const platform::Place &place) const { \ - CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \ +#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyToCPU() const { \ + CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_); \ + } \ + \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ + const platform::Place &place) const { \ + CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, 
&gpu_, &gpu_memory_size_, place); \ } INSTANTIATE_VECTOR_FOR_TYPE(size_t) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 0fd67efc177b3d6bd83b1c9d8325d0de81c0d2e5..a589a5b4ea7e15fc24f443e8062635b1e337adfe 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -22,7 +22,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/utils/none.h" #include "paddle/utils/optional.h" @@ -30,6 +29,9 @@ limitations under the License. */ namespace paddle { namespace framework { +template +using Vector = std::vector; + inline paddle::optional OptionalCUDAPlace( const paddle::memory::allocation::AllocationPtr &gpu_) { return gpu_ == nullptr ? paddle::none @@ -39,7 +41,7 @@ inline paddle::optional OptionalCUDAPlace( // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template -class Vector { +class MixVector { public: using value_type = T; using iterator = typename std::vector::iterator; @@ -49,82 +51,68 @@ class Vector { // The actual class to implement vector logic class VectorData { public: - VectorData() : flag_(kDataInCPU) {} - VectorData(size_t count, const T &value) - : cpu_(count, value), flag_(kDataInCPU) {} - VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} template - explicit VectorData(const std::vector &dat) - : cpu_(dat), flag_(kDataInCPU) {} + explicit VectorData(std::vector *dat) : cpu_(dat), flag_(kDataInCPU) {} ~VectorData() {} - VectorData(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - } + VectorData(const VectorData &o) = delete; - VectorData &operator=(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - return *this; - } + VectorData &operator=(const VectorData &o) = delete; T &operator[](size_t i) { MutableCPU(); - return cpu_[i]; + return (*cpu_)[i]; } const T &operator[](size_t i) const { ImmutableCPU(); - return cpu_[i]; + return (*cpu_)[i]; } - size_t size() const { return cpu_.size(); } + size_t size() const { return (*cpu_).size(); } iterator begin() { MutableCPU(); - return cpu_.begin(); + return (*cpu_).begin(); } iterator end() { MutableCPU(); - return cpu_.end(); + return (*cpu_).end(); } T &front() { MutableCPU(); - return cpu_.front(); + return (*cpu_).front(); } T &back() { MutableCPU(); - return cpu_.back(); + return (*cpu_).back(); } const_iterator begin() const { ImmutableCPU(); - return cpu_.begin(); + return (*cpu_).begin(); } const_iterator end() const { ImmutableCPU(); - return cpu_.end(); + return (*cpu_).end(); } const T &back() const { ImmutableCPU(); - return cpu_.back(); + return (*cpu_).back(); } - T *data() { return &(*this)[0]; } + T *data() { return cpu_->data(); } - const T *data() const { return &(*this)[0]; } + const T *data() const { return cpu_->data(); } const T &front() const { ImmutableCPU(); - return cpu_.front(); + return (*cpu_).front(); } // assign this from iterator. @@ -132,14 +120,14 @@ class Vector { template void assign(Iter begin, Iter end) { MutableCPU(); - cpu_.assign(begin, end); + (*cpu_).assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. void push_back(T elem) { MutableCPU(); - cpu_.push_back(elem); + (*cpu_).push_back(elem); } // extend a vector by iterator. 
@@ -147,14 +135,14 @@ class Vector { template void Extend(It begin, It end) { MutableCPU(); - auto out_it = std::back_inserter>(this->cpu_); + auto out_it = std::back_inserter>(*(this->cpu_)); std::copy(begin, end, out_it); } // resize the vector void resize(size_t size) { MutableCPU(); - cpu_.resize(size); + (*cpu_).resize(size); } // get cuda ptr. immutable @@ -176,26 +164,16 @@ class Vector { // clear void clear() { - cpu_.clear(); + (*cpu_).clear(); flag_ = kDirty | kDataInCPU; } - size_t capacity() const { return cpu_.capacity(); } - - // reserve data - void reserve(size_t size) const { cpu_.reserve(size); } + std::vector *get_vector() { return cpu_; } - // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { - ImmutableCPU(); - return cpu_; - } + size_t capacity() const { return (*cpu_).capacity(); } - bool operator==(const VectorData &other) const { - ImmutableCPU(); - other.ImmutableCPU(); - return cpu_ == other.cpu_; - } + // reserve data + void reserve(size_t size) const { (*cpu_).reserve(size); } std::mutex &Mutex() const { return mtx_; } @@ -203,6 +181,13 @@ class Vector { return OptionalCUDAPlace(gpu_); } + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + private: enum DataFlag { kDataInCPU = 0x01, @@ -213,13 +198,6 @@ class Vector { void CopyToCPU() const; - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - void ImmutableCUDA(platform::Place place) const { if (IsDirty()) { if (IsInCPU()) { @@ -269,7 +247,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } - mutable std::vector cpu_; + std::vector *cpu_; mutable paddle::memory::allocation::AllocationPtr gpu_; mutable size_t gpu_memory_size_{0}; mutable int flag_; @@ -278,89 +256,77 @@ class Vector { }; public: - // Default ctor. Create empty Vector - Vector() : m_(new VectorData()) {} - - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) - : m_(new VectorData(count, value)) {} - - // Ctor with init_list - Vector(std::initializer_list init) : m_(new VectorData(init)) {} - // implicit cast from std::vector. template - Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT + MixVector(const std::vector *dat) { // NOLINT + m_.reset(new VectorData(const_cast *>(dat))); } // Copy ctor - Vector(const Vector &other) { m_ = other.m_; } + MixVector(const MixVector &other) = delete; // Copy operator - Vector &operator=(const Vector &other) { - m_ = other.m_; - return *this; - } + MixVector &operator=(const MixVector &other) = delete; // Move ctor - Vector(Vector &&other) { m_ = std::move(other.m_); } + MixVector(MixVector &&other) = delete; // CPU data access method. Mutable. - T &operator[](size_t i) { return (*m_.MutableData())[i]; } + T &operator[](size_t i) { return (*m_)[i]; } // CPU data access method. Immutable. - const T &operator[](size_t i) const { return m_.Data()[i]; } + const T &operator[](size_t i) const { return (*m_)[i]; } // std::vector iterator methods. 
Based on CPU data access method - size_t size() const { return m_.Data().size(); } + size_t size() const { return m_->size(); } - iterator begin() { return m_.MutableData()->begin(); } + iterator begin() { return m_->begin(); } - iterator end() { return m_.MutableData()->end(); } + iterator end() { return m_->end(); } - T &front() { return m_.MutableData()->front(); } + T &front() { return m_->front(); } - T &back() { return m_.MutableData()->back(); } + T &back() { return m_->back(); } - const_iterator begin() const { return m_.Data().begin(); } + const_iterator begin() const { return m_->begin(); } - const_iterator end() const { return m_.Data().end(); } + const_iterator end() const { return m_->end(); } const_iterator cbegin() const { return begin(); } const_iterator cend() const { return end(); } - const T &back() const { return m_.Data().back(); } + const T &back() const { return m_->back(); } - T *data() { return m_.MutableData()->data(); } + T *data() { return m_->data(); } - const T *data() const { return m_.Data().data(); } + const T *data() const { return m_->data(); } - const T &front() const { return m_.Data().front(); } + const T &front() const { return m_->front(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - m_.MutableData()->assign(begin, end); + m_->assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { m_.MutableData()->push_back(elem); } + void push_back(T elem) { m_->push_back(elem); } // extend a vector by iterator. // NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - m_.MutableData()->Extend(begin, end); + m_->Extend(begin, end); } // resize the vector void resize(size_t size) { - if (m_.Data().size() != size) { - m_.MutableData()->resize(size); + if (m_->size() != size) { + m_->resize(size); } } @@ -368,15 +334,15 @@ class Vector { const T *CUDAData(platform::Place place) const { { platform::CUDAPlace p(place.GetDeviceId()); - auto &mtx = m_.Data().Mutex(); + auto &mtx = m_->Mutex(); std::lock_guard guard(mtx); - auto cuda_place = m_.Data().CUDAPlace(); + auto cuda_place = m_->CUDAPlace(); if (cuda_place == paddle::none || cuda_place == p) { - return m_.Data().CUDAData(place); + return m_->CUDAData(place); } } - // If m_ contains CUDAData in a different place. Detach manually. - m_.Detach(); + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); return CUDAData(place); } @@ -384,25 +350,25 @@ class Vector { T *CUDAMutableData(platform::Place place) { { platform::CUDAPlace p(place.GetDeviceId()); - auto &mtx = m_.Data().Mutex(); + auto &mtx = m_->Mutex(); std::lock_guard guard(mtx); - auto cuda_place = m_.Data().CUDAPlace(); + auto cuda_place = m_->CUDAPlace(); if (cuda_place == paddle::none || cuda_place == p) { - return m_.MutableData()->CUDAMutableData(place); + return m_->CUDAMutableData(place); } } - // If m_ contains CUDAData in a different place. Detach manually. 
- m_.Detach(); + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); return CUDAMutableData(place); } // clear - void clear() { m_.MutableData()->clear(); } + void clear() { m_->clear(); } - size_t capacity() const { return m_.Data().capacity(); } + size_t capacity() const { return m_->capacity(); } // reserve data - void reserve(size_t size) { m_.Data().reserve(size); } + void reserve(size_t size) { m_->reserve(size); } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -422,26 +388,12 @@ class Vector { } } - // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { return m_.Data(); } - - bool operator==(const Vector &other) const { - if (size() != other.size()) return false; - auto it1 = cbegin(); - auto it2 = other.cbegin(); - for (; it1 < cend(); ++it1, ++it2) { - if (*it1 != *it2) { - return false; - } - } - return true; - } + void CopyToCPU() { m_->MutableCPU(); } - const void *Handle() const { return &m_.Data(); } + const void *Handle() const { return m_.get(); } private: - // Vector is an COW object. - mutable details::COWPtr m_; + mutable std::unique_ptr m_; }; }; // namespace framework diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 011e2729d4adffd49c65f536f2ebb33d9a949e56..4cd9aab2896b6fc5940af38cde35945d007aec64 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/device_context.h" template -using vec = paddle::framework::Vector; +using vec = paddle::framework::MixVector; using gpuStream_t = paddle::gpuStream_t; static __global__ void multiply_10(int* ptr) { @@ -44,10 +44,11 @@ gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { } TEST(mixed_vector, GPU_VECTOR) { - vec tmp; + std::vector x; for (int i = 0; i < 10; ++i) { - tmp.push_back(i); + x.push_back(i); } + vec tmp(&x); ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu(0); @@ -70,10 +71,11 @@ TEST(mixed_vector, MultiGPU) { return; } - vec tmp; + std::vector x; for (int i = 0; i < 10; ++i) { - tmp.push_back(i); + x.push_back(i); } + vec tmp(&x); ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu0(0); paddle::platform::SetDeviceId(0); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 766a3b9e495d521db3d628d170fb13fa32bdebb2..878b845211ca1ae9e92f43fcc6ac82da366264d4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -389,7 +389,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { auto op_with_kernel = dynamic_cast(op); { platform::RecordEvent infershape_event( - "InferShape", platform::TracerEventType::OperatorInner, 1, + "infer_shape", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); // If it is OperatorBase, InferShape do nothing. 
if (op_with_kernel != nullptr) @@ -411,23 +411,23 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } { platform::RecordEvent compute_event( - "Compute", platform::TracerEventType::OperatorInner, 1, + "compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { - // fit for pten - if (instr_node.PtenKernel() && instr_node.PtenKernel()->IsValid()) { - VLOG(4) << "Run pten kernel: " << op->Type(); + // fit for phi + if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) { + VLOG(4) << "Run phi kernel: " << op->Type(); VLOG(4) << instr_node.InnerRuntimeContext().get() << " " << &instr_node.DeviceContext(); phi::KernelContext pt_kernel_context; - op_with_kernel->BuildPtenKernelContext( + op_with_kernel->BuildPhiKernelContext( *instr_node.InnerRuntimeContext().get(), const_cast(&instr_node.DeviceContext()), &pt_kernel_context); - (*instr_node.PtenKernel())(&pt_kernel_context); + (*instr_node.PhiKernel())(&pt_kernel_context); } else { instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); @@ -561,7 +561,8 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); - platform::RecordEvent instruction_event(op->Type().c_str()); + platform::RecordEvent instruction_event( + op->Type(), platform::TracerEventType::Operator, 1); interpreter::WaitEvent(instr_node, place_); try { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 0767dde4392b89d57539ad697f5b64d2090b0fcd..d595af58257d4f6e0f6bd1fd009ab78e181f96f7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -407,14 +407,14 @@ void build_op_func_list(const platform::Place& place, auto exec_ctx = ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context); - auto run_pten_kernel = false; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel( + auto run_phi_kernel = false; + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( op_with_kernel->Type())) { - auto pt_kernel_key = op_with_kernel->ChoosePtenKernel(exec_ctx); - auto pt_kernel_name = op_with_kernel->PtenKernelSignature()->name; + auto pt_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx); + auto pt_kernel_name = op_with_kernel->PhiKernelSignature()->name; - if (op_with_kernel->PtenKernel()->IsValid()) { - run_pten_kernel = true; + if (op_with_kernel->PhiKernel()->IsValid()) { + run_phi_kernel = true; } else { auto kernels_iter = all_op_kernels.find(op_with_kernel->Type()); if (kernels_iter == all_op_kernels.end() || @@ -422,26 +422,26 @@ void build_op_func_list(const platform::Place& place, kernels_iter->second.end()) { auto pt_cpu_kernel_key = FallBackToCpu( expected_kernel_key, pt_kernel_key, *op_with_kernel); - op_with_kernel->ResetPtenKernel( + op_with_kernel->ResetPhiKernel( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_cpu_kernel_key))); - if (op_with_kernel->PtenKernel()->IsValid()) { + if (op_with_kernel->PhiKernel()->IsValid()) { VLOG(6) << "Static mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key - << " | kernel: " << *(op_with_kernel->PtenKernel()); - run_pten_kernel = true; + << " | kernel: " << *(op_with_kernel->PhiKernel()); + run_phi_kernel = true; } } } 
} VLOG(3) << op_with_kernel->Type() << " : expected_kernel_key : " << expected_kernel_key; - if (run_pten_kernel) { + if (run_phi_kernel) { phi::KernelContext pt_kernel_context; - op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx, - &pt_kernel_context); - op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); + op_with_kernel->BuildPhiKernelContext(runtime_context, dev_ctx, + &pt_kernel_context); + op_func_node.pt_kernel_ = op_with_kernel->PhiKernel(); (*op_func_node.pt_kernel_)(&pt_kernel_context); } else { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 1fbe4500ac6dff261cc38e33ad90bfd92b83ad39..35bac4393170331486298a29f1b6be26065ad864 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -688,9 +688,7 @@ OpKernelComputeFunc Instruction::KernelFunc() const { return op_func_node_.kernel_func_; } -phi::Kernel* Instruction::PtenKernel() const { - return op_func_node_.pt_kernel_; -} +phi::Kernel* Instruction::PhiKernel() const { return op_func_node_.pt_kernel_; } OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 93b9aee4f32cbfa88c0a79d4018b3a2ca03cf035..dc34bd2c69411837b6130b87dba1753687cf82f8 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -300,7 +300,7 @@ struct OpFuncNode { OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned - // fit for pten kernel + // fit for phi kernel phi::Kernel* pt_kernel_{nullptr}; // not owned OpFuncType type_; @@ -321,7 +321,7 @@ class Instruction { OpKernelComputeFunc KernelFunc() const; - phi::Kernel* PtenKernel() const; + phi::Kernel* PhiKernel() const; OpFuncType KernelType() const; diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h index a4a913cdff22db18e467670be9644ed90dca542e..21b2927b52eab653e20611e135a8c0f905057fcf 100644 --- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h +++ b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h @@ -44,7 +44,6 @@ class ThreadDataRegistry { template ::value>> void SetCurrentThreadData(const T& val) { - std::lock_guard lock(lock_); CurrentThreadData() = val; } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 07c5298c2f22377e277939e11af6fa6c142f24bc..596ffb9bfc0c4f624aeaf5874bdf18563d96d14c 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -8,6 +8,7 @@ #include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -61,6 +62,8 @@ class WorkQueueImpl : public WorkQueue { } void AddTask(std::function fn) override { + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); if (tracker_ != nullptr) { fn = [ task = std::move(fn), raii = CounterGuard(tracker_) @@ -156,6 +159,8 @@ 
WorkQueueGroupImpl::~WorkQueueGroupImpl() { } void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); assert(queue_idx < queues_.size()); if (queues_options_.at(queue_idx).track_task) { fn = [ diff --git a/paddle/fluid/framework/op_kernel_info_helper.h b/paddle/fluid/framework/op_kernel_info_helper.h deleted file mode 100644 index d62711bb882750b93bdd33a5e7d9d1ab44c20c95..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/op_kernel_info_helper.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/kernel_factory.h" - -namespace paddle { -namespace framework { - -class OpKernelInfoHelper { - public: - static const std::string& GetOpName(const paddle::OpKernelInfo& info) { - return info.op_name_; - } - - static const phi::Backend& GetBackend(const paddle::OpKernelInfo& info) { - return info.backend_; - } - - static const phi::DataLayout& GetDataLayout( - const paddle::OpKernelInfo& info) { - return info.layout_; - } - - static const phi::DataType& GetDataType(const paddle::OpKernelInfo& info) { - return info.dtype_; - } - - static phi::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) { - return phi::KernelKey(info.backend_, info.layout_, info.dtype_); - } - - static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) { - return info.kernel_fn_; - } - - static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) { - return info.variadic_kernel_fn_; - } - - static const paddle::SmallVector& GetInputDefs( - const paddle::OpKernelInfo& info) { - return info.input_defs_; - } - - static const paddle::SmallVector& GetOutputDefs( - const paddle::OpKernelInfo& info) { - return info.output_defs_; - } - - static const paddle::SmallVector& GetAttributeDefs( - const paddle::OpKernelInfo& info) { - return info.attribute_defs_; - } -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8e614faa248faccd1385ea29e0cd0950f08c481d..d33791f70c4d2f759bcd4f6443a5a1f244673d4f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_call_stack.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" @@ -263,11 +263,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // in order to record different op type cost time // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( - Type().c_str(), platform::TracerEventType::Operator, 1); - auto op_name = platform::OpName(outputs_, Type()); - platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, 1, - platform::EventRole::kUniqueOp); + Type(), platform::TracerEventType::Operator, 1); + // auto op_name = platform::OpName(outputs_, Type()); + // platform::RecordEvent op_name_record_event( + // op_name, platform::TracerEventType::Operator, 1, + // platform::EventRole::kUniqueOp); RunImpl(scope, place); } @@ -616,9 +616,9 @@ bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = - kernel_factory.SelectKernelMap(phi::TransToPtenKernelName(op_type)); + kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { - if (platform::is_gpu_place(phi::TransToPtenPlace(kernel.first.backend()))) { + if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } } @@ -1186,10 +1186,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // phase phi::KernelKey pt_kernel_key; std::string pt_kernel_name; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(type_)) { if (pt_kernel_signature_ == nullptr || pt_kernel_ == nullptr) { pt_kernel_signature_.reset( - new KernelSignature(std::move(GetExpectedPtenKernelArgs(exe_ctx)))); + new KernelSignature(std::move(GetExpectedPhiKernelArgs(exe_ctx)))); VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( @@ -1197,22 +1197,32 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); pt_kernel_name = pt_kernel_signature_->name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get()); + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_kernel_key))); if (pt_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePtenKernel - kernel name: " + VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << *pt_kernel_; } else { - VLOG(6) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } - if (pt_kernel_->IsValid()) { - run_pten_kernel_ = true; +#ifdef PADDLE_WITH_XPU + bool is_xpu_unsupport = + paddle::platform::is_xpu_place(kernel_type_->place_) && + !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || + paddle::platform::is_in_xpu_black_list(type_); +#endif + if (pt_kernel_->IsValid() +#ifdef PADDLE_WITH_XPU + && !is_xpu_unsupport +#endif + ) { + run_phi_kernel_ = true; } else { 
auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); @@ -1220,13 +1230,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, kernels_iter->second.find(*kernel_type_.get()) == kernels_iter->second.end() #ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(kernel_type_->place_) && // NOLINT - !paddle::platform::is_xpu_support_op( - type_, *kernel_type_.get()) // NOLINT - || paddle::platform::is_in_xpu_black_list(type_) + || is_xpu_unsupport #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -1238,12 +1244,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << *pt_kernel_; - run_pten_kernel_ = true; + run_phi_kernel_ = true; } } } } - if (!run_pten_kernel_) { + if (!run_phi_kernel_) { if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { ChooseKernel(exe_ctx); dev_ctx = pool.Get(kernel_type_->place_); @@ -1284,13 +1290,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - if (run_pten_kernel_) { + if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext // TODO(zhiqiu): support TransferInplaceVarsBack - PreparePtenData(exec_scope, *pt_kernel_, *pt_kernel_signature_, - runtime_ctx); - BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + PreparePhiData(exec_scope, *pt_kernel_, *pt_kernel_signature_, + runtime_ctx); + BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); (*pt_kernel_)(&pt_kernel_context); } else { (*kernel_func_)( @@ -1382,26 +1388,26 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( return expected_kernel_key; } -phi::KernelKey OperatorWithKernel::ChoosePtenKernel( +phi::KernelKey OperatorWithKernel::ChoosePhiKernel( const ExecutionContext& ctx) const { pt_kernel_signature_.reset( - new KernelSignature(std::move(GetExpectedPtenKernelArgs(ctx)))); + new KernelSignature(std::move(GetExpectedPhiKernelArgs(ctx)))); VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( new OpKernelType(std::move(InnerGetExpectedKernelType(ctx)))); auto pt_kernel_name = pt_kernel_signature_->name; - auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get()); + auto pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset(new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_kernel_key))); if (pt_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePtenKernel - kernel name: " << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << *pt_kernel_; } else { - VLOG(6) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } return pt_kernel_key; @@ -1912,7 +1918,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( tensor.layout()); } -KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( +KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const { InitDefaultKernelSignatureMap(); ExecutionArgumentMappingContext arg_mapping_ctx(ctx); @@ -1920,7 +1926,7 @@ KernelSignature 
OperatorWithKernel::GetExpectedPtenKernelArgs( arg_mapping_ctx); } -Scope* OperatorWithKernel::PreparePtenData( +Scope* OperatorWithKernel::PreparePhiData( const Scope& scope, const phi::Kernel& pt_kernel, const KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { auto& input_names = std::get<0>(pt_kernel_signature.args); @@ -1972,12 +1978,15 @@ Scope* OperatorWithKernel::PreparePtenData( continue; } - auto expected_place = phi::TransToPtenPlace(in_def.backend); + if (in_def.backend == phi::Backend::ALL_BACKEND) { + continue; + } + auto expected_place = phi::TransToPhiPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; } - VLOG(3) << "PTen Transform Variable " << input_names[i] << " from " + VLOG(3) << "phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; if (!new_scope) { @@ -1998,7 +2007,7 @@ Scope* OperatorWithKernel::PreparePtenData( return new_scope; } -void OperatorWithKernel::BuildPtenKernelContext( +void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2037,7 +2046,7 @@ void OperatorWithKernel::BuildPtenKernelContext( (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second); // deal with optional here - if ((it == ctx.inputs.end()) && + if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == std::type_index(typeid(paddle::optional)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); @@ -2102,7 +2111,7 @@ void OperatorWithKernel::BuildPtenKernelContext( experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, output_defs.at(i)); SetAllocationForOutputTenosr( - tensor_out, phi::TransToPtenPlace(output_defs.at(i).backend)); + tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -2136,10 +2145,10 @@ void OperatorWithKernel::BuildPtenKernelContext( auto& ins_vector = ctx.inputs.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVar(*ins_vector.front()))); + experimental::MakePhiScalarArrayFromVar(*ins_vector.front()))); } else { // ShapeTensorList pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVarList(ins_vector))); + experimental::MakePhiScalarArrayFromVarList(ins_vector))); } } } else if (attr_defs[i].type_index == @@ -2169,8 +2178,8 @@ void OperatorWithKernel::BuildPtenKernelContext( } } else { auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarFromVar(*ins_vector.front()))); + pt_kernel_context->EmplaceBackAttr( + std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } } else { @@ -2189,7 +2198,7 @@ void OperatorWithKernel::BuildPtenKernelContext( pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPtenDataType( + auto data_type = paddle::framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); pt_kernel_context->EmplaceBackAttr(data_type); @@ -2197,7 +2206,7 @@ void OperatorWithKernel::BuildPtenKernelContext( std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == 
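// Editor's note: a minimal sketch of the attribute conversion performed in the branch
// opened here: when the phi kernel declares a std::vector<int64_t> attribute but the op
// attribute holds std::vector<int>, the values are copied element-wise into the wider
// type (the template arguments were stripped in the surrounding hunk text).
#include <cstdint>
#include <vector>
std::vector<int64_t> WidenIntAttr(const std::vector<int>& attr) {
  return std::vector<int64_t>(attr.begin(), attr.end());
}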
std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Pten_Kernel args. + // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ff9cb8a287a26210cb585c1c58dcb20e860af880..16718a316513e3574e9a7eb14ed50106c8b0dcb6 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -30,7 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" @@ -423,7 +423,7 @@ class ExecutionContext { "size(%d).", allocation_ptr->size(), phi::product(dim) * sizeof(T))); - paddle::framework::Tensor temp_tensor(framework::TransToPtenDataType( + paddle::framework::Tensor temp_tensor(framework::TransToPhiDataType( framework::ToDataType(std::type_index(typeid(T))))); temp_tensor.Resize(dim); temp_tensor.ResetHolder(std::move(shared_allocation)); @@ -538,14 +538,14 @@ class OperatorWithKernel : public OperatorBase { } bool SupportGPU() const override { - auto pten_kernels = phi::KernelFactory::Instance().SelectKernelMap( - phi::TransToPtenKernelName(type_)); - auto has_pten_kernel = - std::any_of(pten_kernels.begin(), pten_kernels.end(), + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); - if (has_pten_kernel) { + if (has_phi_kernel) { return true; } else { auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); @@ -558,7 +558,7 @@ class OperatorWithKernel : public OperatorBase { } bool SupportNPU() const override { - // TODO(zhiqiu): support pten if needed? + // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { @@ -566,7 +566,7 @@ class OperatorWithKernel : public OperatorBase { }); } bool SupportMLU() const override { - // TODO(zhiqiu): support pten if needed? + // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { @@ -603,39 +603,39 @@ class OperatorWithKernel : public OperatorBase { return kernel_type_->place_; } - /* member functions for adapting to pten lib */ + /* member functions for adapting to phi lib */ /** In the Tensor calculation library, the new Kernel adopts a clearer and * more streamlined design. The arguments of the Kernel and the input and * output arguments registered in the original OpMaker do not match in some * cases, so we use map to record the arguments required by the kernel. * When selecting Kernel during Op execution, select the arguments of the - * original Op according to the GetExpectedPtenKernelArgs returned arguments. 
+ * original Op according to the GetExpectedPhiKernelArgs returned arguments. */ - phi::KernelSignature GetExpectedPtenKernelArgs( + phi::KernelSignature GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const; - /* member functions for adapting to pten lib */ - phi::KernelKey ChoosePtenKernel(const ExecutionContext& ctx) const; + /* member functions for adapting to phi lib */ + phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const; /** - * Transfer data place for pten kernel + * Transfer data place for phi kernel * Is this really needed? */ - Scope* PreparePtenData(const Scope& scope, const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const; + Scope* PreparePhiData(const Scope& scope, const phi::Kernel& pt_kernel, + const phi::KernelSignature& pt_kernel_signature, + RuntimeContext* ctx) const; - void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx, - phi::KernelContext* pt_kernel_context) const; + void BuildPhiKernelContext(const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, + phi::KernelContext* pt_kernel_context) const; - phi::KernelSignature* PtenKernelSignature() const { + phi::KernelSignature* PhiKernelSignature() const { return pt_kernel_signature_.get(); } - phi::Kernel* PtenKernel() const { return pt_kernel_.get(); } + phi::Kernel* PhiKernel() const { return pt_kernel_.get(); } - void ResetPtenKernel(phi::Kernel* kernel) const { + void ResetPhiKernel(phi::Kernel* kernel) const { return pt_kernel_.reset(kernel); } @@ -692,9 +692,9 @@ class OperatorWithKernel : public OperatorBase { mutable std::mutex cache_update_mutex_; mutable bool enable_cache_transfer_scope_ = false; // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, + // new phi kernel, if there is a better design in the future, // we may polish the implementation here - mutable bool run_pten_kernel_ = false; + mutable bool run_phi_kernel_ = false; mutable bool run_kp_kernel = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 3516e71b837917cae2d60193ec5e3798c9d1a211..6e55727c8bf67c18a0b27454eaa3c3f48ee9db89 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -44,11 +44,6 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { - -namespace ir { -class MemOptVarInfo; -} // namespace ir - namespace paddle2cinn { using framework::ir::Graph; @@ -375,7 +370,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, const std::unordered_set& ignore_names) { auto result = std::make_unique>(); for (auto* node : nodes) { - if (ignore_names.count(node->Name())) { + if (!node->Var() || ignore_names.count(node->Name())) { continue; } result->emplace_back(node->Name()); @@ -398,9 +393,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, kNoNeedBufferFeeds, no_need_buffer_feeds.release()); // initialize empty map for kMemOptVarInfoFromMainGraph attribute, // it will be filled on the share_mem_opt_info_to_subgraph pass - subgraph->GetOrInit>>( - kMemOptVarInfoFromMainGraph); + subgraph->GetOrInit(kMemOptVarInfoFromMainGraph); return subgraph; } diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h index 
8cb920831cc543a073652051c1ba234e974179c3..a902eacde820fac8556c42b5b4ccbb6342c7bba8 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -18,6 +18,10 @@ limitations under the License. */ namespace paddle { namespace framework { +namespace ir { +class MemOptVarInfo; +} // namespace ir + namespace paddle2cinn { constexpr char kCinnLaunchOp[] = "cinn_launch"; @@ -27,6 +31,9 @@ constexpr char kInternalVars[] = "InternalVars"; constexpr char kOutputVars[] = "OutputVars"; constexpr char kMemOptVarInfoFromMainGraph[] = "mem_opt_var_info_from_main_graph"; +using Name2VarInfoMap = + std::unordered_map>; // A pass named BuildCinnPass, the function of this pass is: // diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 919fc60d4cb61b6079965e3c8ab7d43ca9a2b211..bf9d1baaf394f05d125563311dd2047383373834 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -255,7 +255,9 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), std::unordered_set({v0, v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); + ASSERT_EQ(std::unordered_set(cinn_op->outputs.begin(), + cinn_op->outputs.end()), + std::unordered_set({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 716cd85e7117af4680f6cad908810ebdf6f5f973..706815185a1b5b53d1bb8e26274206abc126cfd5 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -248,10 +248,10 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; - compiled_obj->launch_context = - std::make_unique( - compiled_obj->paddle2cinn_varmap, compiled_obj->scope); compiled_obj->cached_index = compiled_num; + compiled_obj->launch_context = + std::make_unique(graph, + *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 09bca4a735461914e203cd479f45d000985a37b4..c0e1ca8f0d123379f3363afc45dd083b4a5dc951 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -209,7 +209,7 @@ class CinnGraphSymbolizationTest : public ::testing::Test { tensor.Resize(dims); tensor.mutable_data( platform::CPUPlace(), - framework::TransToPtenDataType(framework::proto::VarType::FP32)); + framework::TransToPhiDataType(framework::proto::VarType::FP32)); return tensor; }; #define FillFeedList(Name) feed_targets[#Name] = create_tensor(); diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/phi_utils.cc similarity index 79% rename from paddle/fluid/framework/pten_utils.cc rename to paddle/fluid/framework/phi_utils.cc index b96eb848e43a4e8ab6c323aa4361ed401dd9adf7..355291beb60f949b52b681592d42b7da4e80186b 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" @@ -57,17 +57,16 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { paddle::SmallVector attr_names_; }; -OpKernelType TransPtenKernelKeyToOpKernelType( - const phi::KernelKey& kernel_key) { +OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { proto::VarType::Type data_type = paddle::framework::TransToProtoVarType(kernel_key.dtype()); // no need to set current device id here - platform::Place place = phi::TransToPtenPlace(kernel_key.backend(), false); + platform::Place place = phi::TransToPhiPlace(kernel_key.backend(), false); DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; if (kernel_key.backend() == phi::Backend::MKLDNN) { library_type = LibraryType::kMKLDNN; - } else if (kernel_key.backend() == phi::Backend::CUDNN) { + } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; } else { // do nothing @@ -76,19 +75,19 @@ OpKernelType TransPtenKernelKeyToOpKernelType( return OpKernelType(data_type, place, data_layout, library_type); } -phi::KernelKey TransOpKernelTypeToPtenKernelKey( +phi::KernelKey TransOpKernelTypeToPhiKernelKey( const OpKernelType& kernel_type) { - phi::Backend backend = phi::TransToPtenBackend(kernel_type.place_); + phi::Backend backend = phi::TransToPhiBackend(kernel_type.place_); if (kernel_type.library_type_ == LibraryType::kMKLDNN) { backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { - backend = phi::Backend::CUDNN; + backend = phi::Backend::GPUDNN; } else { // do } paddle::experimental::DataLayout layout = kernel_type.data_layout_; paddle::experimental::DataType dtype = - paddle::framework::TransToPtenDataType(kernel_type.data_type_); + paddle::framework::TransToPhiDataType(kernel_type.data_type_); return phi::KernelKey(backend, layout, dtype); } @@ -98,8 +97,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(expected_kernel_key.place_) || paddle::platform::is_in_xpu_black_list(op.Type())) { - VLOG(3) << "pten missing XPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing XPU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -107,8 +106,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing NPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing NPU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -116,8 +115,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing MLU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing MLU kernel: " << op.Type() + << "phipected_kernel_key:" << 
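// Editor's note: a compact sketch of the library/backend mapping performed by
// TransOpKernelTypeToPhiKernelKey earlier in this hunk (plain kernels map to CPU/GPU by
// place, kMKLDNN stays MKLDNN, and kCUDNN now maps to GPUDNN). The enums are local
// stand-ins, not the real phi types.
enum class LibraryTypeSketch { kPlain, kMKLDNN, kCUDNN };
enum class BackendSketch { CPU, GPU, MKLDNN, GPUDNN };
BackendSketch ToBackendSketch(bool is_gpu_place, LibraryTypeSketch lib) {
  if (lib == LibraryTypeSketch::kMKLDNN) return BackendSketch::MKLDNN;
  if (lib == LibraryTypeSketch::kCUDNN) return BackendSketch::GPUDNN;
  return is_gpu_place ? BackendSketch::GPU : BackendSketch::CPU;
}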
expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -132,17 +131,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { auto& in = op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(6) << "Parse PtenKernel input: skip extra & quant input - " + VLOG(6) << "Parse PhiKernel input: skip extra & quant input - " << in_name; continue; } // If contains dispensable input, we should override the - // GetExpectedPtenKernelArgs method self + // OpArgumentMapping method self in phi/ops/compat dir if (in.has_dispensable() && in.dispensable()) { - VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name; + VLOG(6) << "Parse PhiKernel input: skip dispensable input - " << in_name; continue; } - VLOG(6) << "Parse PtenKernel input: " << in_name; + VLOG(6) << "Parse PhiKernel input: " << in_name; input_names_.emplace_back(in_name); } return input_names_; @@ -153,8 +152,12 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { for (int i = 0; i < op_proto_->outputs_size(); ++i) { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); - // TODO(chenweihang): outputs also need skip some cases - VLOG(6) << "Parse PtenKernel output: " << out_name; + if ((out.has_extra() && out.extra()) || (out.has_quant() && out.quant())) { + VLOG(6) << "Parse PhiKernel output: skip extra & quant output - " + << out_name; + continue; + } + VLOG(6) << "Parse PhiKernel output: " << out_name; output_names_.emplace_back(out_name); } return output_names_; @@ -165,20 +168,21 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { for (int i = 0; i < op_proto_->attrs_size(); ++i) { auto& attr = op_proto_->attrs()[i]; auto& attr_name = attr.name(); - if (attr_name == "use_mkldnn" || attr_name == "op_role" || - attr_name == "op_role_var" || attr_name == "op_namescope" || - attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(6) << "Parse PtenKernel attribute: skip needless attr - " + if (attr_name == "use_mkldnn" || attr_name == "use_cudnn" || + attr_name == "op_role" || attr_name == "op_role_var" || + attr_name == "op_namescope" || attr_name == "op_callstack" || + attr_name == "op_device") { + VLOG(6) << "Parse PhiKernel attribute: skip needless attr - " << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(6) << "Parse PtenKernel attribute: skip extra & quant attr - " + VLOG(6) << "Parse PhiKernel attribute: skip extra & quant attr - " << attr_name; continue; } - VLOG(6) << "Parse PtenKernel attribute: " << attr_name; + VLOG(6) << "Parse PhiKernel attribute: " << attr_name; attr_names_.emplace_back(attr_name); } @@ -186,7 +190,7 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(phi::TransToPtenKernelName(op_proto_->type()), + return KernelSignature(phi::TransToPhiKernelName(op_proto_->type()), GetInputArgsNames(), GetAttrsArgsNames(), GetOutputArgsNames()); } @@ -198,7 +202,7 @@ void InitDefaultKernelSignatureMap() { for (const auto& pair : paddle::framework::OpInfoMap::Instance().map()) { const auto& op_type = pair.first; const auto* op_proto = pair.second.proto_; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type) && op_proto) { paddle::framework::KernelArgsNameMakerByOpProto 
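// Editor's note: a self-contained sketch of the shape of the data that
// KernelArgsNameMakerByOpProto::GetKernelSignature assembles above: the phi kernel name
// plus the input, attribute and output argument names that survive the extra/quant/
// dispensable filtering. This struct is a stand-in, not phi::KernelSignature.
#include <string>
#include <vector>
struct KernelSignatureSketch {
  std::string kernel_name;             // e.g. the result of TransToPhiKernelName(op_type)
  std::vector<std::string> inputs;     // filtered op input argument names
  std::vector<std::string> attrs;      // filtered attribute names
  std::vector<std::string> outputs;    // filtered op output argument names
};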
maker(op_proto); VLOG(10) << "Register kernel signature for " << op_type; diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/phi_utils.h similarity index 87% rename from paddle/fluid/framework/pten_utils.h rename to paddle/fluid/framework/phi_utils.h index 1bcffbcc3143547eb1df0975c9e2163bfebed02e..1a1f79d82770058ae4010b7a3a3162280ceb1537 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -44,9 +44,8 @@ using KernelSignature = phi::KernelSignature; /* Kernel Key translate */ -OpKernelType TransPtenKernelKeyToOpKernelType(const phi::KernelKey& kernel_key); -phi::KernelKey TransOpKernelTypeToPtenKernelKey( - const OpKernelType& kernel_type); +OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key); +phi::KernelKey TransOpKernelTypeToPhiKernelKey(const OpKernelType& kernel_type); phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, const phi::KernelKey& kernel_key, const framework::OperatorBase& op); @@ -68,25 +67,25 @@ void SetAllocationForOutputTenosr(phi::TensorBase* tensor, // TODO(Wilber): support others device context. template -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = T; }; template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::GPUContext; }; #endif #ifdef PADDLE_WITH_XPU template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::XPUContext; }; #endif diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc similarity index 52% rename from paddle/fluid/framework/pten_utils_test.cc rename to paddle/fluid/framework/phi_utils_test.cc index 3c86372e6e7528908a51b83b611da53cd68cff79..cbcdf24c9f32b47f3337b4f176753328497d8c85 100644 --- a/paddle/fluid/framework/pten_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
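// Editor's note: a self-contained sketch of the ConvertToPhiContext trait declared in
// the header above: it maps a fluid device-context type to the matching phi context and
// defaults to the identity. The two context types here are empty stand-ins for
// platform::CPUDeviceContext and phi::CPUContext.
struct CPUDeviceContextSketch {};
struct PhiCPUContextSketch {};
template <typename T>
struct ConvertToPhiContextSketch {
  using TYPE = T;  // identity fallback for contexts without a phi counterpart
};
template <>
struct ConvertToPhiContextSketch<CPUDeviceContextSketch> {
  using TYPE = PhiCPUContextSketch;
};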
*/ -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" -TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { +TEST(PhiUtils, TransPhiKernelKeyToOpKernelType) { phi::KernelKey kernel_key(phi::Backend::CPU, phi::DataLayout::NCHW, phi::DataType::FLOAT32); auto op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_)); @@ -33,7 +33,7 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { phi::KernelKey kernel_key_mkldnn(phi::Backend::MKLDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_mkldnn); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_mkldnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_)); @@ -42,10 +42,10 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { #endif #ifdef PADDLE_WITH_CUDA - phi::KernelKey kernel_key_cudnn(phi::Backend::CUDNN, phi::DataLayout::NCHW, + phi::KernelKey kernel_key_cudnn(phi::Backend::GPUDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_cudnn); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_cudnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_gpu_place(op_kernel_type.place_)); @@ -53,3 +53,38 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { paddle::framework::LibraryType::kCUDNN); #endif } + +TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { + paddle::framework::OpKernelType op_kernel_type( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kNCHW); + auto kernel_key = + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type); + ASSERT_EQ(kernel_key.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key.layout(), phi::DataLayout::NCHW); + ASSERT_EQ(kernel_key.backend(), phi::Backend::CPU); + +#ifdef PADDLE_WITH_MKLDNN + paddle::framework::OpKernelType op_kernel_type_mkldnn( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kMKLDNN, + paddle::framework::LibraryType::kMKLDNN); + auto kernel_key_mkldnn = + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); + ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); + ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::MKLDNN); +#endif + +#ifdef PADDLE_WITH_CUDA + paddle::framework::OpKernelType op_kernel_type_cudnn( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kNCHW, + paddle::framework::LibraryType::kCUDNN); + auto kernel_key_cudnn = + 
paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_cudnn); + ASSERT_EQ(kernel_key_cudnn.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key_cudnn.layout(), phi::DataLayout::NCHW); + ASSERT_EQ(kernel_key_cudnn.backend(), phi::Backend::GPUDNN); +#endif +} diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 4d34ba85517e163f966b49d118e5fdce50865419..e0cf860e5bc7b94872e612112a4d5977571db489 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include @@ -20,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -44,6 +46,7 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } + InitializeGPUServer(trainer_desc); scale_datanorm_ = trainer_desc.scale_datanorm(); int place_num = trainer_desc.worker_places_size(); const std::vector readers = @@ -84,6 +87,166 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } +void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { + // add for hbmps optimizer config + auto fleet_desc_str = trainer_desc.fleet_desc(); + google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); + auto sparse_table = + _ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.downpour_accessor_param(); + auto accessor_class = sparse_table_accessor.accessor_class(); + // gpups' sparse table optimizer config + // now only support single sparse table + // auto sparse_table = param_.sparse_table(0); + std::unordered_map config; + if (accessor_class == "DownpourFeatureValueAccessor" || + accessor_class == "DownpourCtrAccessor" || + accessor_class == "DownpourCtrDoubleAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["learning_rate"] = + sparse_table_accessor.sparse_sgd_param().learning_rate(); + config["initial_g2sum"] = + sparse_table_accessor.sparse_sgd_param().initial_g2sum(); + config["initial_range"] = + sparse_table_accessor.sparse_sgd_param().initial_range(); + if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) { + config["min_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[0]; + config["max_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } else if (accessor_class == "DownpourSparseValueAccessor") { + auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); + if (optimizer_name == "naive") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .learning_rate(); + config["initial_range"] = 
sparse_table_accessor.sparse_commonsgd_param() + .naive() + .initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_range(); + config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_g2sum(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["learning_rate"] = + sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate(); + config["initial_range"] = + sparse_table_accessor.sparse_commonsgd_param().adam().initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[1]; + } + } + } else if (accessor_class == "DownpourUnitAccessor" || + accessor_class == "DownpourDoubleUnitAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); + if (optimizer_name == "naive") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().naive().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .naive() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + 
.weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adam().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .adam() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; + } + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } + + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(config); +} + std::string PSGPUTrainer::GetDumpPath(int tid) { if (user_define_dump_filename_ != "") { return string::format_string("%s/part-%s-%05d", dump_fields_path_.c_str(), diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index b9a262105e47479fce8f5ae4f1ab6b852464d745..57eddf782f06bfce1d42c26e68c7789207bcf37f 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/stream.h" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1eb5727298c39aba41b4efe832b10d363b6030ea..10ceae62dccbbab9329b73e0f581b51508511194 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1455,22 +1455,10 @@ std::ostream& print_tensor>( } std::ostream& operator<<(std::ostream& os, const LoD& lod) { - os << "{"; - for (auto& v : lod) { - os << "{"; - bool is_first = true; - for (auto& i : v) { - if (is_first) { - os << i; - is_first = false; - } else { - os << ", " << i; - } - } - os << "}"; - } - os << "}"; - + // NOTE(xiongkun): + // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution + // if we don't redefine, the operator << of phi / framework LoD is not found. + paddle::string::operator<<(os, lod); return os; } @@ -1479,6 +1467,11 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { namespace phi { +std::ostream& operator<<(std::ostream& os, const LoD& lod) { + paddle::string::operator<<(os, lod); + return os; +} + std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { if (t.lod().size() > 0) { os << " - lod: " << t.lod() << "\n"; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index c993895a9f0ea1ff5e592366136b4e3bba562bd8..8a11775702e57887015f831fcd4e3a3f91bd9d56 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -36,6 +36,10 @@ limitations under the License. 
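// Editor's note: a self-contained sketch of the pattern PSGPUTrainer::InitializeGPUServer
// above follows: optimizer hyper-parameters are flattened into a string -> float map
// before being handed to the GPU parameter-server wrapper. SparseSgdParamSketch is a
// stand-in for the protobuf accessor config, and the default values are illustrative only.
#include <string>
#include <unordered_map>
struct SparseSgdParamSketch {
  float learning_rate = 0.05f;
  float initial_range = 1e-4f;
  float min_bound = -10.0f;
  float max_bound = 10.0f;
};
std::unordered_map<std::string, float> BuildSparseConfig(
    const SparseSgdParamSketch& p) {
  std::unordered_map<std::string, float> config;
  config["learning_rate"] = p.learning_rate;
  config["initial_range"] = p.initial_range;
  config["min_bound"] = p.min_bound;
  config["max_bound"] = p.max_bound;
  return config;  // consumed by something like PSGPUWrapper::InitializeGPUServer(config)
}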
*/ #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/phi/backends/dynload/port.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif + namespace paddle { namespace framework { @@ -267,6 +271,7 @@ class PSGPUTrainer : public TrainerBase { template void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); + void InitializeGPUServer(const TrainerDesc& trainer_desc); protected: Dataset* dataset_; @@ -287,6 +292,9 @@ class PSGPUTrainer : public TrainerBase { int mpi_rank_; int mpi_size_; int dump_file_num_; + + // _ps_param for gpups optimizer config + ::paddle::PSParameter _ps_param; }; #endif diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 96d312437b34cf1fafc4fbcaeec91201a1fa934a..6fe33545aa22d3f17234dbb1b6cd8ad1bb719409 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -66,6 +66,9 @@ message TrainerDesc { repeated int32 trainers = 35; optional int32 trainer_id = 36; + // add for gpu + optional string fleet_desc = 37; + // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; optional DownpourWorkerParameter downpour_param = 103; diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index f649c9388f0f6518dc4f8a587f5c9f9c01451373..945b68438e1e702e7b2e6498a26b0a107c6640da 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -69,6 +69,12 @@ class InferVarTypeContext { return op_->Inputs().at(name).size(); } + virtual size_t OutputSize(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL( + op_, platform::errors::PreconditionNotMet("op_ should not be null")); + return op_->Outputs().at(name).size(); + } + virtual const std::string& InputVarName(const std::string& name, const int index = 0) const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 90cf0e76e000736f730121a6fcce841aa38a59ae..f198919b0c87bb4f2ea9991e401a8242676d3f46 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,11 +1,11 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) -cc_library(var_helper SRCS var_helper.cc DEPS tensor pten_api) +cc_library(var_helper SRCS var_helper.cc DEPS tensor phi_api) IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) ENDIF() -cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper pten_api) +cc_library(layer SRCS layer.cc DEPS prepared_operator math_function 
imperative_flag variable_helper op_registry var_helper phi_api) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) @@ -31,6 +31,9 @@ if(NOT WIN32) cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(reducer SRCS reducer.cc DEPS layer) endif() + if(WITH_CNCL) + cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) endif() @@ -44,9 +47,9 @@ if(WITH_GLOO) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function pten_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner pten_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 24a8ffbabf526ca779511f620648c64fcbb59cca..436e22f00c303d59652db33a723fe727b63657ef 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -90,6 +90,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, platform::DeviceContextPool::Instance().Get(place)); bool use_calc_stream = (dev_ctx->stream() == stream); + VLOG(4) << "Is use calculate stream: " << use_calc_stream; // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, // but we can use other ways to implement is in the future @@ -97,7 +98,9 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, framework::Vector rows_num_vector(strategy.nranks_); rows_num_vector[strategy.local_rank_] = static_cast(src_rows.size()); // CUDAMutableData use CalStream - auto *gpu_rows_num_ptr = rows_num_vector.CUDAMutableData(place); + paddle::framework::MixVector mixv_rows_num_vector(&rows_num_vector); + auto *gpu_rows_num_ptr = mixv_rows_num_vector.CUDAMutableData(place); + VLOG(4) << "start dev_ctx->wait"; if (!use_calc_stream) { dev_ctx->Wait(); } @@ -109,6 +112,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, platform::GpuStreamSync(stream); } + mixv_rows_num_vector.CopyToCPU(); const auto *cpu_rows_num_ptr = rows_num_vector.data(); auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + strategy.nranks_, @@ -121,8 +125,10 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); - auto *dst_rows_ptr = dst_rows->CUDAMutableData(place); - const auto *src_rows_ptr = src_rows.CUDAData(place); + paddle::framework::MixVector mixv_dst_rows(dst_rows); + auto *dst_rows_ptr = mixv_dst_rows.CUDAMutableData(place); + paddle::framework::MixVector mixv_src_rows(&src_rows); + const auto *src_rows_ptr = mixv_src_rows.CUDAData(place); auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); @@ -150,24 +156,28 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, comm->comm(), stream)); - return; - } - for (int i = 0; i < strategy.nranks_; ++i) { - if (cpu_rows_num_ptr[i] > 0) { - // 2. Broadcast the rows of SelectedRows - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( - src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], - ncclInt64, i, comm->comm(), stream)); - // 3. Broadcast the tensor data of SelectedRows - auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + - row_offset * feature_size * sizeof_dtype; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( - src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, - nccl_dtype, i, comm->comm(), stream)); - row_offset += cpu_rows_num_ptr[i]; + } else { + for (int i = 0; i < strategy.nranks_; ++i) { + if (cpu_rows_num_ptr[i] > 0) { + // 2. Broadcast the rows of SelectedRows + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( + src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], + ncclInt64, i, comm->comm(), stream)); + // 3. 
Broadcast the tensor data of SelectedRows + auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + + row_offset * feature_size * sizeof_dtype; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( + src_tensor_ptr, dst_tensor_ptr_i, + cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(), + stream)); + row_offset += cpu_rows_num_ptr[i]; + } } } - + if (!use_calc_stream) { + platform::GpuStreamSync(stream); + } + mixv_dst_rows.CopyToCPU(); VLOG(3) << "Original SelectedRows rows: " << string::join_strings(src_rows, ','); VLOG(3) << "Result SelectedRows rows: " diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 94c6d0a4d569a1ce458ed3590385de446d0ee150..149202468be6c6bec833f100adfd4100c520f8f3 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -70,12 +70,12 @@ OpSupportedInfos(const std::string& place, } } - auto pten_kernels = phi::KernelFactory::Instance().kernels(); - for (auto& kernel_pair : pten_kernels) { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto& kernel_pair : phi_kernels) { auto op_type = phi::TransToFluidOpName(kernel_pair.first); for (auto& info_pair : kernel_pair.second) { framework::OpKernelType kernel_type = - framework::TransPtenKernelKeyToOpKernelType(info_pair.first); + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); if (is_target_place[query_place](kernel_type.place_) && kernel_type.data_type_ == dtype && all_ops.count(op_type)) { VLOG(4) << op_type << " " << supported_ops.size(); @@ -273,8 +273,9 @@ static inline std::shared_ptr CastToBF16( template static inline framework::proto::VarType::Type GetPromoteType( - const std::string& op_type, const NameVarMap& ins) { - auto dst_type = framework::proto::VarType::FP16; + const std::string& op_type, const NameVarMap& ins, + const framework::proto::VarType::Type amp_dtype) { + auto dst_type = amp_dtype; for (const auto& pair : ins) { for (const auto& var : pair.second) { if (GetDataType(var) == framework::proto::VarType::FP32) { @@ -337,7 +338,8 @@ NameVarMap AutoCastInputs(const std::string& op_type, } return new_ins; } else { - auto dst_type = GetPromoteType(op_type, ins); + auto dst_type = + GetPromoteType(op_type, ins, framework::proto::VarType::FP16); // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. if (dst_type == framework::proto::VarType::FP16 && @@ -435,7 +437,7 @@ NameVarMap AutoCastBF16Inputs(const std::string& op_type, } } return new_ins; - } else { + } else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { for (auto& pair : new_ins) { VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to float"; @@ -444,6 +446,26 @@ NameVarMap AutoCastBF16Inputs(const std::string& op_type, } } return new_ins; + } else { + auto dst_type = + GetPromoteType(op_type, ins, framework::proto::VarType::BF16); + // NOTE(zhangbo): if the op has op fp16 kernel, fall back to fp32. + if (dst_type == framework::proto::VarType::BF16 && + AmpOperators::Instance().GetMutableUnsupportedBf16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } + for (auto& pair : new_ins) { + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to " + << framework::DataTypeToString(dst_type); + for (auto& var : pair.second) { + var = (dst_type == framework::proto::VarType::FP32 + ? 
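// Editor's note: a standalone sketch of the promotion rule GetPromoteType implements
// after this change: the starting dtype is now the AMP dtype passed in (FP16 or BF16),
// and it is promoted to FP32 as soon as any input is FP32. DTypeSketch is a local
// stand-in for framework::proto::VarType::Type.
#include <vector>
enum class DTypeSketch { FP16, BF16, FP32 };
DTypeSketch GetPromoteTypeSketch(const std::vector<DTypeSketch>& input_dtypes,
                                 DTypeSketch amp_dtype) {
  DTypeSketch dst = amp_dtype;
  for (DTypeSketch d : input_dtypes) {
    if (d == DTypeSketch::FP32) {
      dst = DTypeSketch::FP32;  // any FP32 input forces the whole op to FP32
      break;
    }
  }
  return dst;
}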
CastToFP32(var) + : CastToBF16(var)); + } + } + return new_ins; } return new_ins; } diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 97a188e5c9c2712c2c6d819b7e8f0c5ca0b2a47a..8373c7fe50d0222d6b38a400e82239dc8c3590ad 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -154,7 +154,7 @@ void BasicEngine::CheckBackwardInputs(const OpBase& op) { // Here, we use the type of the corresponding forward datatype. tensor->mutable_data( - op.place(), framework::TransToPtenDataType(var->ForwardDataType())); + op.place(), framework::TransToPhiDataType(var->ForwardDataType())); VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero with dtype " << framework::DataTypeToString(var->ForwardDataType()); diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..779b748c2d2d43db1019bf60d063a21eb209b6bb --- /dev/null +++ b/paddle/fluid/imperative/cncl_context.cc @@ -0,0 +1,237 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/imperative/cncl_context.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, + const mluStream stream, const platform::CNCLComm *comm) { + const auto &place = src.place(); + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(place), true, + platform::errors::Unimplemented( + "Imperative mode does not support multi-CPU training yet.")); + + const void *src_ptr = src.data(); + dst->Resize(src.dims()); + auto *dst_ptr = dst->mutable_data(src.place(), src.dtype()); + auto cncl_dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(src.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(src_ptr, dst_ptr, src.numel(), + cncl_dtype, cnclSum, comm->comm(), + stream)); +} + +void CNCLParallelContext::BcastCNCLId( + std::vector &cncl_ids, // NOLINT + int root, int server_fd) { + if (strategy_.local_rank_ == root) { + std::vector other_trainers; + for (auto &ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) { + other_trainers.push_back(ep); + } + } + platform::SendBroadCastCommID(other_trainers, &cncl_ids); + } else { + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &cncl_ids); + } +} + +void 
CNCLParallelContext::Init() { + int server_fd = -1; + + std::vector cncl_ids; + cncl_ids.resize(strategy_.nrings_); + + if (strategy_.local_rank_ == 0) { + // generate the unique cnclid on the root worker + for (size_t i = 0; i < cncl_ids.size(); ++i) { + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&cncl_ids[i])); + } + } else { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastCNCLId(cncl_ids, 0, server_fd); + + int mlu_id = place_.device; + for (int ring_id = 0; ring_id < strategy_.nrings_; ++ring_id) { + VLOG(0) << "init cncl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " mlu id: " << mlu_id + << " ring id: " << ring_id; + // it will assign cncl_comm in MLUDeviceContext within ring_id + platform::CNCLCommContext::Instance().CreateComm( + &cncl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, mlu_id, + ring_id); + + compute_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + } +} + +void CNCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; + std::vector cncl_ids; + cncl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique cnclid on the root worker + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&cncl_ids[0])); + } else { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastCNCLId(cncl_ids, 0, server_fd); + + int mlu_id = place_.device; + VLOG(0) << "init cncl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " mlu id: " << mlu_id + << " ring id: " << ring_id; + // it will assign cncl_comm in MLUDeviceContext within ring_id + platform::CNCLCommContext::Instance().CreateComm( + &cncl_ids[0], strategy_.nranks_, strategy_.local_rank_, mlu_id, ring_id); + + compute_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); +} + +void CNCLParallelContext::AllReduceByStream(const framework::Variable &src, + framework::Variable *dst, + int ring_id, bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(place_), true, + platform::errors::Unimplemented( + "Dynamic graph mode does not support multi-CPU training yet.")); + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + platform::CNCLComm *comm = + platform::CNCLCommContext::Instance().Get(ring_id, place_); + mluStream stream = (use_calc_stream ? 
dev_ctx->stream() : comm->stream()); + + if (src.IsType()) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable(), stream, comm); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor is supported.", + platform::demangle(framework::ToTypeName(src.Type())))); + } +} + +void CNCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::CNCLComm *comm = + platform::CNCLCommContext::Instance().Get(ring_id, place); + mluStream stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(src_tensor->dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(src_ptr, src_tensor->numel(), cncl_dtype, + 0, comm->comm(), stream)); +} + +paddle::platform::DeviceContext *CNCLParallelContext::GetDeviceContext( + int ring_id) { + return static_cast( + platform::CNCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context()); +} + +void CNCLParallelContext::WaitCompute(int ring_id) { + PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( + "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), + platform::errors::OutOfRange( + "ring id must < compute events size," + "but got ring id = %d, compute events size = %d", + ring_id, compute_events_.size())); + + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = + platform::CNCLCommContext::Instance().Get(ring_id, place_)->stream(); + auto event = compute_events_[ring_id].get(); + + // compute_stream-->event-->comm_stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event, compute_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueWaitNotifier(event, comm_stream, 0)); +} + +void CNCLParallelContext::WaitComm(int ring_id) { + PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( + "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), + platform::errors::OutOfRange( + "ring id must < comm events size," + "but got ring id = %d, comm events size = %d", + ring_id, comm_events_.size())); + + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = + platform::CNCLCommContext::Instance().Get(ring_id, place_)->stream(); + auto event = comm_events_[ring_id].get(); + + // comm_stream-->event-->compute_stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event, comm_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueWaitNotifier(event, compute_stream, 0)); +} + +void CNCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + +} // namespace imperative +} // namespace paddle + +#endif diff --git a/paddle/fluid/imperative/cncl_context.h b/paddle/fluid/imperative/cncl_context.h new file mode 100644 index 0000000000000000000000000000000000000000..85f53319bfcde909f8ddc42ad1640a6b5269632d --- /dev/null +++ b/paddle/fluid/imperative/cncl_context.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CNCL) +#include + +#include +#include +#include + +#include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +class CNCLParallelContext : public ParallelContext { + public: + explicit CNCLParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : ParallelContext(strategy, place) {} + + ~CNCLParallelContext() override = default; + + void BcastCNCLId(std::vector& cncl_ids, int root, // NOLINT + int server_fd); + + void Init() override; + + void InitWithRingID(int ring_id) override; + + void AllReduceByStream(const framework::Variable& src, + framework::Variable* dst, int ring_id, + bool use_calc_stream) override; + + void Broadcast(framework::Variable* src, int ring_id) override; + + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; + + void WaitCompute(int ring_id) override; + + void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; + + private: + // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] + std::vector> compute_events_; + + // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream + std::vector> comm_events_; +}; + +} // namespace imperative +} // namespace paddle +#endif diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 8997966165769cac1c89ad7c8846cdd13bbc2348..dd34b8b619f80a0e7cb5f122d10850482b1b74ad 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -143,7 +143,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, // but we can use other ways to implement is in the future - const auto &src_rows = src.rows(); + auto &src_rows = src.rows(); auto gloo_wrapper = framework::GlooWrapper::GetInstance(); size_t local_row_num = src_rows.size(); std::vector rows_num_vector = @@ -157,8 +157,10 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, << ", height: " << src.height(); auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); - auto *dst_rows_ptr = dst_rows->MutableData(place); - const int64_t *src_rows_ptr = src_rows.Data(place); + paddle::framework::MixVector mixv_dst_rows(dst_rows); + auto *dst_rows_ptr = mixv_dst_rows.MutableData(place); + paddle::framework::MixVector mixv_src_rows(&src_rows); + const int64_t *src_rows_ptr = mixv_src_rows.Data(place); auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 3587736a851da57cab6892593a5087dcdd338622..0abc5ad90e2697eb78ff1e21ceb2bc0e97e14a44 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -791,13 +791,13 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var, << var->Var().Get().dims(); tensor->Resize(var->Var().Get().dims()); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } else { auto* tensor = dst_var->MutableVar()->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } } @@ -925,13 +925,13 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, << var->Var().Get().dims(); tensor->Resize(var->Var().Get().dims()); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } else { auto* tensor = dst_var->MutableVar()->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index f1d0c8afdd50e3868423a9906d9955d7aea66983..56ddbf338619890f8a88bdf09a0bb770ec31bb2f 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -314,10 +314,10 @@ static void FillConstantLike(const VariableWrapper &ref_var, // default data_type for now. 
if (ref_var.ForwardDataType() != -1) { dst_tensor->mutable_data( - place, framework::TransToPtenDataType(ref_var.ForwardDataType())); + place, framework::TransToPhiDataType(ref_var.ForwardDataType())); } else { - dst_tensor->mutable_data( - place, framework::TransToPtenDataType(ref_var.DataType())); + dst_tensor->mutable_data(place, + framework::TransToPhiDataType(ref_var.DataType())); } phi::funcs::set_constant(*dev_ctx, dst_tensor, value); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 05218ba961fdd115bd0d28755ce14e03a1c01003..9dd1dacc02c25474803ef3177d9cd967ee681714 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -121,7 +121,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, kernel_type_(kernel_type), func_(nullptr), dev_ctx_(dev_ctx), - run_pten_kernel_(true), + run_phi_kernel_(true), pt_kernel_signature_(kernel_signature), pt_kernel_(pt_kernel) {} @@ -151,7 +151,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif // NOTE(zhiqiu): for kernels on given device, for example NPU, the order to // choose is: - // pten npu kernel > fluid npu kernel > pten cpu kernel > fluid cpu kernel + // phi npu kernel > fluid npu kernel > phi cpu kernel > fluid cpu kernel // 1. get expected kernel key auto dygraph_exe_ctx = DygraphExecutionContext( @@ -161,16 +161,27 @@ PreparedOp PrepareImpl(const NameVarMap& ins, framework::KernelSignature pt_kernel_signature; phi::KernelKey pt_kernel_key; std::string pt_kernel_name; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { - pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); +#ifdef PADDLE_WITH_XPU + bool is_xpu_unsupport = + paddle::platform::is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(op.Type(), + expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(op.Type()); +#endif + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { + pt_kernel_signature = op.GetExpectedPhiKernelArgs(dygraph_exe_ctx); VLOG(6) << pt_kernel_signature; pt_kernel_name = pt_kernel_signature.name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key); + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); - if (pt_kernel.IsValid()) { + if (pt_kernel.IsValid() +#ifdef PADDLE_WITH_XPU + && !is_xpu_unsupport +#endif + ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; @@ -184,7 +195,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); } else { - VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } @@ -197,14 +208,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, kernels_iter->second.find(expected_kernel_key) == kernels_iter->second.end()) #ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), - expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()) + || is_xpu_unsupport #endif - ) { - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { + ) { + if 
(phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); auto pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( @@ -230,9 +237,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #ifdef PADDLE_WITH_XPU if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (kernel_iter == kernels.end() || - !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()))) { + (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -418,12 +423,12 @@ static void PreparedOpRunPtImpl( platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - PreparePtenData(pt_kernel, pt_kernel_signature, ins); + PreparePhiData(pt_kernel, pt_kernel_signature, ins); phi::KernelContext pt_kernel_context; - BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, - outs, attrs, default_attrs, dev_ctx, - &pt_kernel_context); + BuildDygraphPhiKernelContext(pt_kernel_signature, pt_kernel, ins, + outs, attrs, default_attrs, dev_ctx, + &pt_kernel_context); pt_kernel(&pt_kernel_context); } @@ -446,7 +451,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -460,7 +465,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -474,7 +479,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 589c8edd446bdb8eaf56d43826c7c5305829965b..8e1e2fbe9a12da672a633075ed4c41d3d62cd7e1 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/layer.h" @@ -201,9 +201,9 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelFunc func_; platform::DeviceContext* dev_ctx_; // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, + // new phi kernel, if there is a better design in the future, // we may polish the implementation here - bool run_pten_kernel_{false}; + bool run_phi_kernel_{false}; bool run_kp_kernel_{false}; framework::KernelSignature pt_kernel_signature_; phi::Kernel pt_kernel_; @@ 
-225,7 +225,7 @@ const inline framework::Attribute& GetAttr( } template -void BuildDygraphPtenKernelContext( +void BuildDygraphPhiKernelContext( const framework::KernelSignature& pt_kernel_signature, const phi::Kernel& pt_kernel, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, @@ -327,7 +327,7 @@ void BuildDygraphPtenKernelContext( experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, output_defs.at(i)); framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPtenPlace(output_defs.at(i).backend)); + tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -369,7 +369,7 @@ void BuildDygraphPtenKernelContext( auto& ins_vector = ins.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVar(ins_vector[0]->Var()))); + experimental::MakePhiScalarArrayFromVar(ins_vector[0]->Var()))); } else { // ShapeTensorList std::vector variables; variables.reserve(ins_vector.size()); @@ -377,7 +377,7 @@ void BuildDygraphPtenKernelContext( variables.push_back(var_base->MutableVar()); } kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVarList(variables))); + experimental::MakePhiScalarArrayFromVarList(variables))); } } } else if (attr_defs[i].type_index == @@ -409,7 +409,7 @@ void BuildDygraphPtenKernelContext( } else { // scalar is in the input auto& ins_vector = ins.at(attr_names[i]); kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarFromVar(ins_vector[0]->Var()))); + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } } else { @@ -428,7 +428,7 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = framework::TransToPtenDataType( + auto data_type = framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); kernel_ctx->EmplaceBackAttr(data_type); @@ -436,13 +436,15 @@ void BuildDygraphPtenKernelContext( std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Pten_Kernel args. + // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); kernel_ctx->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " @@ -454,9 +456,9 @@ void BuildDygraphPtenKernelContext( } template -void PreparePtenData(const phi::Kernel& pt_kernel, - const framework::KernelSignature& pt_kernel_signature, - const NameVarMap& ins) { +void PreparePhiData(const phi::Kernel& pt_kernel, + const framework::KernelSignature& pt_kernel_signature, + const NameVarMap& ins) { auto& input_names = std::get<0>(pt_kernel_signature.args); auto& input_defs = pt_kernel.args_def().input_defs(); @@ -477,12 +479,15 @@ void PreparePtenData(const phi::Kernel& pt_kernel, auto var = ins_vector[offset]; const auto* tensor_in = GetTensorFromVar(var->Var()); if (tensor_in && tensor_in->IsInitialized()) { - auto expected_place = phi::TransToPtenPlace(in_def.backend); + if (in_def.backend == phi::Backend::ALL_BACKEND) { + continue; + } + auto expected_place = phi::TransToPhiPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; } - VLOG(3) << "Pten Transform Variable " << input_names[i] << " from " + VLOG(3) << "Phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; framework::Tensor tmp_tensor; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 8681382394b9eea65ddcd8977c96e8a517516edd..3a6365b2af21ae9012fe37293699caed9bb23855 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -446,7 +446,7 @@ void Reducer::InitializeGroups( InitializeDenseGroups(variable_indices_, &group); auto tensor = group.dense_contents_.GetMutable(); tensor->Resize(phi::make_ddim({group.all_length_})) - .mutable_data(place_, framework::TransToPtenDataType(group.dtype_)); + .mutable_data(place_, framework::TransToPhiDataType(group.dtype_)); } // map variables to this group by VariableLocator @@ -738,7 +738,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, - framework::TransToPtenDataType(group.dtype_)); + framework::TransToPhiDataType(group.dtype_)); } #ifdef PADDLE_WITH_XPU_BKCL diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 774bb9653e2cba5c27f9037ee905e70175375339..e4f1cfdb3baeed9b5945b7843b6593528df48c29 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -9,10 +9,13 @@ else() if (WITH_XPU_BKCL) cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) endif() + if (WITH_CNCL) + cc_test(cncl_context_test SRCS cncl_context_test.cc DEPS cncl_context) + endif() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function pten_tensor pten_api pten_api_utils) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function phi_tensor 
phi_api phi_api_utils) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/cncl_context_test.cc b/paddle/fluid/imperative/tests/cncl_context_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d5ee8e7fc899f2b5496fd808b39c1bf4be69e73 --- /dev/null +++ b/paddle/fluid/imperative/tests/cncl_context_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/cncl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" + +#include "gtest/gtest.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +// Node1: FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test +// Node2: FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test + +int nrings = 1; +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:9866", "localhost:9867"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = 2; + strategy.local_rank_ = local_rank; + strategy.nrings_ = nrings; + return strategy; +} + +#if defined(PADDLE_WITH_CNCL) +void Broadcast(int local_rank, int device_id) { + int data_size = 4; + float test_data = 7; + const auto& place = platform::MLUPlace(device_id); + platform::MLUDeviceContext ctx(place); + + imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place); + + // init + cpc.Init(); + + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // fill data for rank 0 only + std::vector src_vec; + if (local_rank == 0) { + for (int i = 0; i < data_size; ++i) { + src_vec.push_back(test_data); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + } + ctx.Wait(); + + // call broadcast + cpc.Broadcast(src_dev_var, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + for (int i = 0; i < data_size; ++i) { + EXPECT_EQ(dst_vec[i], test_data); + } +} + +TEST(Broadcast, Run) { + if (platform::GetMLUDeviceCount() >= 2) { + int local_rank = atoi(getenv("PADDLE_TRAINER_ID")); + int device_id = atoi(getenv("FLAGS_selected_mlus")); + Broadcast(local_rank, device_id); + } +} + +void 
AllReduceByStream(int local_rank, int device_id) { + int data_size = 32; + const auto& place = platform::MLUPlace(device_id); + platform::MLUDeviceContext ctx(place); + + imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place); + + // init + cpc.Init(); + + // input data + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // fill input data + std::vector src_vec; + for (int i = 0; i < data_size; ++i) { + src_vec.push_back(1.0 + local_rank); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + ctx.Wait(); + + // output data + framework::Variable* dst_dev_var(new framework::Variable()); + auto* dst_dev_tensor = dst_dev_var->GetMutable(); + dst_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // call allreduce + cpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + EXPECT_EQ(dst_vec.size(), src_vec.size()); + for (int i = 0; i < data_size; ++i) { + EXPECT_EQ(dst_vec[i], 3.0); + } +} + +TEST(AllReduceByStream, Run) { + if (platform::GetMLUDeviceCount() >= 2) { + int local_rank = atoi(getenv("PADDLE_TRAINER_ID")); + int device_id = atoi(getenv("FLAGS_selected_mlus")); + AllReduceByStream(local_rank, device_id); + } +} +#endif diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index bca7ecc5d17dc814931e3f81a21d67ec43159355..6c304278d21fde7af093b25cdd8f62a1d4528d31 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -96,7 +96,7 @@ void GroupConcatSplit(Place place, size_t size) { { // concat auto* tensor = group.dense_contents_.GetMutable(); tensor->Resize(phi::make_ddim({group.all_length_})) - .mutable_data(place, framework::TransToPtenDataType(group.dtype_)); + .mutable_data(place, framework::TransToPhiDataType(group.dtype_)); group.ConcatTensors(*dev_ctx); group.DivNRanks(*dev_ctx, 1); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 1c9cc538ffece6778084075b01d565050e00d71e..85bcbd1458f24a592b646dfcda750f37f113f73f 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -32,6 +32,8 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local bool Tracer::enable_program_desc_tracing_ = false; + thread_local bool Tracer::has_grad_ = true; thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; @@ -173,7 +175,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 2); + type, platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -203,17 +205,19 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { - VLOG(5) << "Auto mixed precision run operator: " << type; if (amp_dtype_ == phi::DataType::FLOAT16) { + VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastInputs(type, ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { + VLOG(5) << 
"BFloat16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastBF16Inputs(type, ins); } } else if (amp_level_ == AmpLevel::O2) { - VLOG(5) << "Pure fp16 run operator: " << type; if (amp_dtype_ == phi::DataType::FLOAT16) { + VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureFp16Inputs(type, ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { + VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureBf16Inputs(type, ins); } } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b508126c367960a842eb9562d42af1de9defade1..73ecbbe6143ca8e68049c2d2886e9eee93b741f1 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -160,10 +160,11 @@ class Tracer { private: std::unique_ptr basic_engine_; std::unique_ptr program_desc_tracer_; - bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; + + static thread_local bool enable_program_desc_tracing_; static thread_local bool has_grad_; static thread_local AmpLevel amp_level_; static thread_local phi::DataType amp_dtype_; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index d731bfe139bac58050fdf79b420744551bfd17e8..26b8b9e8e17e046964d648f564c26293036e4033 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) -get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) +get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) set(utils_modules stringpiece pretty_log string_helper) add_subdirectory(api) @@ -47,12 +47,11 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
if(WIN32 AND WITH_GPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) + cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) +elseif(WITH_IPU) + cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) else() - create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) - if(WITH_IPU) - target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils) - endif() + create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) endif() if(NOT APPLE) @@ -82,7 +81,7 @@ set(SHARED_INFERENCE_SRCS ${PADDLE_CUSTOM_OP_SRCS}) # shared inference library deps -set(SHARED_INFERENCE_DEPS ${fluid_modules} ${pten_modules} analysis_predictor) +set(SHARED_INFERENCE_DEPS ${fluid_modules} ${phi_modules} analysis_predictor) if (WITH_CRYPTO) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index f474ccd260e808c2b852eb0443818e8265bb7f7a..a5c32164bf1a28687ea6f8cc53427db67560c307 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -278,10 +278,14 @@ struct Argument { // ipu related DECL_ARGUMENT_FIELD(use_ipu, UseIpu, bool); DECL_ARGUMENT_FIELD(ipu_device_num, IpuDeviceNum, int); + DECL_ARGUMENT_FIELD(ipu_micro_batch_size, IpuMicroBatchSize, int); DECL_ARGUMENT_FIELD(ipu_enable_pipelining, IpuEnablePipelining, bool); DECL_ARGUMENT_FIELD(ipu_batches_per_step, IpuBatchesPerStep, int); - DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int); - DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool); + DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool); + DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int); + DECL_ARGUMENT_FIELD(ipu_available_memory_proportion, + IpuAvailableMemoryProportion, float); + DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool); // npu related DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 837b83004de84e6839935835e5b7d4d1e2bc3f45..796c86a3ad1efe45dd8a00139b92c2642676a811 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -54,6 +54,27 @@ void IRPassManager::CreatePasses(Argument *argument, int pass_num = 0; for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); + pass->Set("disable_logs", new bool(argument->disable_logs())); + auto precision_mode = argument->tensorrt_precision_mode(); + bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("max_input_shape", new std::map>( + argument->max_input_shape())); + pass->Set("min_input_shape", new std::map>( + argument->min_input_shape())); + pass->Set("optim_input_shape", new std::map>( + argument->optim_input_shape())); + // tuned trt dynamic_shape + pass->Set("trt_tuned_dynamic_shape", + new bool(argument->tensorrt_tuned_dynamic_shape())); + bool 
with_dynamic_shape = (argument->max_input_shape().size() > 0 && + argument->min_input_shape().size() > 0 && + argument->optim_input_shape().size() > 0) || + argument->tensorrt_tuned_dynamic_shape(); + pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); @@ -99,17 +120,9 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->tensorrt_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - - auto precision_mode = argument->tensorrt_precision_mode(); - bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; - pass->Set("predictor_id", new int(argument->predictor_id())); bool use_calib_mode = argument->tensorrt_use_calib_mode(); - pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); - pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); - pass->Set("with_interleaved", - new bool(argument->tensorrt_with_interleaved())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); @@ -161,22 +174,8 @@ void IRPassManager::CreatePasses(Argument *argument, // tuned trt dynamic_shape pass->Set("trt_shape_range_info_path", new std::string(argument->tensorrt_shape_range_info_path())); - pass->Set("trt_tuned_dynamic_shape", - new bool(argument->tensorrt_tuned_dynamic_shape())); pass->Set("trt_allow_build_at_runtime", new bool(argument->tensorrt_allow_build_at_runtime())); - pass->Set("max_input_shape", new std::map>( - argument->max_input_shape())); - pass->Set("min_input_shape", new std::map>( - argument->min_input_shape())); - pass->Set("optim_input_shape", - new std::map>( - argument->optim_input_shape())); - bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && - argument->min_input_shape().size() > 0 && - argument->optim_input_shape().size() > 0) || - argument->tensorrt_tuned_dynamic_shape(); - pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); pass->Set("trt_disabled_ops", new std::vector( argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); @@ -192,14 +191,15 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { - bool enable_int8 = + bool lite_enable_int8 = argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8; pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); pass->Set("lite_ops_filter", new std::vector(argument->lite_ops_filter())); pass->Set("predictor_id", new int(argument->predictor_id())); - pass->Set("enable_int8", new bool(enable_int8)); + pass->Erase("enable_int8"); + pass->Set("enable_int8", new bool(lite_enable_int8)); pass->Set("use_gpu", new bool(argument->use_gpu())); pass->Set("zero_copy", new bool(argument->lite_zero_copy())); pass->Set("use_xpu", new bool(argument->use_xpu())); @@ -236,7 +236,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::vector( argument->nnadapter_model_cache_token())); } - disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); bool fc_mkldnn_pass = 0; @@ -248,9 +247,6 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); } - - pass->Set("disable_logs", new 
bool(disable_logs_)); - pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index fe6a27f80725f8e6520c0988f195419eb8a0cc1d..321716b1c8a1849c394850a874cd5d20e88c4a9a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -72,17 +72,21 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (argument->use_ipu()) { argument->main_graph().SetNotOwned("num_ipus", &argument->ipu_device_num()); - argument->main_graph().SetNotOwned("need_avg_shard", - &argument->ipu_need_avg_shard()); + argument->main_graph().SetNotOwned("micro_batch_size", + &argument->ipu_micro_batch_size()); argument->main_graph().SetNotOwned("enable_pipelining", &argument->ipu_enable_pipelining()); argument->main_graph().SetNotOwned("batches_per_step", &argument->ipu_batches_per_step()); - argument->main_graph().SetNotOwned("batch_size", - &argument->ipu_batch_size()); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Please compile with WITH_IPU")); + argument->main_graph().SetNotOwned("enable_fp16", + &argument->ipu_enable_fp16()); + argument->main_graph().SetNotOwned("replica_num", + &argument->ipu_replica_num()); + argument->main_graph().SetNotOwned( + "available_memory_proportion", + &argument->ipu_available_memory_proportion()); + argument->main_graph().SetNotOwned("enable_half_partial", + &argument->ipu_enable_half_partial()); } } #endif diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6c465e62780593c043f844e2738132c404e280b5..87efe5ec5190372b48f1bd6387e1c92f456865a1 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -56,8 +56,10 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) - inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + if (WITH_GPU) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + endif() elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 57e49733b329aab8d73ab5e39c594711d5a416a9..fd2ccffae3b4af3280f622722d6080d7c68bfbad 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -142,17 +142,28 @@ void AnalysisConfig::EnableNpu(int device_id) { Update(); } -void AnalysisConfig::EnableIpu(int device_num, bool ipu_enable_pipelining, - int ipu_batches_per_step, int ipu_batch_size, - bool ipu_need_avg_shard) { + +void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, + bool ipu_enable_pipelining, + int ipu_batches_per_step) { enable_ir_optim_ = true; use_ipu_ = true; - ipu_device_num_ = device_num; + ipu_device_num_ = ipu_device_num; + ipu_micro_batch_size_ = ipu_micro_batch_size; ipu_enable_pipelining_ = ipu_enable_pipelining; ipu_batches_per_step_ = ipu_batches_per_step; - ipu_batch_size_ = 
ipu_batch_size; - ipu_need_avg_shard_ = ipu_need_avg_shard; + + Update(); +} + +void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, + float ipu_available_memory_proportion, + bool ipu_enable_half_partial) { + ipu_enable_fp16_ = ipu_enable_fp16; + ipu_replica_num_ = ipu_replica_num; + ipu_available_memory_proportion_ = ipu_available_memory_proportion; + ipu_enable_half_partial_ = ipu_enable_half_partial; Update(); } @@ -255,10 +266,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // ipu related CP_MEMBER(use_ipu_); CP_MEMBER(ipu_device_num_); + CP_MEMBER(ipu_micro_batch_size_); CP_MEMBER(ipu_enable_pipelining_); CP_MEMBER(ipu_batches_per_step_); - CP_MEMBER(ipu_batch_size_); - CP_MEMBER(ipu_need_avg_shard_); + CP_MEMBER(ipu_enable_fp16_); + CP_MEMBER(ipu_replica_num_); + CP_MEMBER(ipu_available_memory_proportion_); + CP_MEMBER(ipu_enable_half_partial_); if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, @@ -684,10 +698,13 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_ipu_; ss << ipu_device_num_; + ss << ipu_micro_batch_size_; ss << ipu_enable_pipelining_; ss << ipu_batches_per_step_; - ss << ipu_batch_size_; - ss << ipu_need_avg_shard_; + ss << ipu_enable_fp16_; + ss << ipu_replica_num_; + ss << ipu_available_memory_proportion_; + ss << ipu_enable_half_partial_; return ss.str(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a3812244fbe224982063e6000924cb670e67b85b..cd6e3a3c759c05bda34978dd78d07358aacd53fe 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -93,6 +93,8 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, input_ptr = t->mutable_data(ddim, place); } else if (pt.dtype == PaddleDType::INT32) { input_ptr = t->mutable_data(ddim, place); + } else if (pt.dtype == PaddleDType::FLOAT16) { + input_ptr = t->mutable_data(ddim, place); } else { LOG(ERROR) << "unsupported feed type " << pt.dtype; return false; @@ -563,8 +565,12 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } else if (type == framework::proto::VarType::INT32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT32; + } else if (type == framework::proto::VarType::FP16) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT16; } else { - LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; + LOG(ERROR) << "unknown type, only support float32, float16, int64 and " + "int32 now."; } } return true; @@ -592,6 +598,14 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetModelParamsPath(config_.params_file()); } + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); + argument_.SetMinInputShape(config_.min_input_shape_); + argument_.SetMaxInputShape(config_.max_input_shape_); + argument_.SetOptimInputShape(config_.optim_input_shape_); + argument_.SetTensorRtTunedDynamicShape( + config_.tuned_tensorrt_dynamic_shape()); if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); @@ -601,18 +615,10 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtDisabledOPs(config_.trt_disabled_ops_); argument_.SetTensorRtUseDLA(config_.trt_use_dla_); argument_.SetTensorRtDLACore(config_.trt_dla_core_); - 
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); - argument_.SetTensorRtUseOSS(config_.trt_use_oss_); - argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); - argument_.SetMinInputShape(config_.min_input_shape_); - argument_.SetMaxInputShape(config_.max_input_shape_); - argument_.SetOptimInputShape(config_.optim_input_shape_); argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path()); - argument_.SetTensorRtTunedDynamicShape( - config_.tuned_tensorrt_dynamic_shape()); argument_.SetTensorRtAllowBuildAtRuntime( config_.trt_allow_build_at_runtime()); argument_.SetTensorRtUseInspector(config_.trt_use_inspector_); @@ -662,12 +668,18 @@ void AnalysisPredictor::PrepareArgument() { LOG(INFO) << "Lite subgraph engine is enabled"; } +#ifdef PADDLE_WITH_IPU argument_.SetUseIpu(config_.use_ipu_); argument_.SetIpuDeviceNum(config_.ipu_device_num()); + argument_.SetIpuMicroBatchSize(config_.ipu_micro_batch_size_); argument_.SetIpuEnablePipelining(config_.ipu_enable_pipelining_); argument_.SetIpuBatchesPerStep(config_.ipu_batches_per_step_); - argument_.SetIpuBatchSize(config_.ipu_batch_size_); - argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_); + argument_.SetIpuEnableFp16(config_.ipu_enable_fp16_); + argument_.SetIpuReplicaNum(config_.ipu_replica_num_); + argument_.SetIpuAvailableMemoryProportion( + config_.ipu_available_memory_proportion_); + argument_.SetIpuEnableHalfPartial(config_.ipu_enable_half_partial_); +#endif argument_.SetUseNpu(config_.use_npu_); argument_.SetNPUDeviceId(config_.npu_device_id()); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 4b13ca073bc4f77756112322700ad5ad6d9d7fa4..180c028c6a61088edeb8723891d4de1ba2272b80 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -234,20 +234,30 @@ struct PD_INFER_DECL AnalysisConfig { /// /// \brief Turn on IPU. /// - /// \param device_num The number of IPUs. - /// \param ipu_enable_pipelining Enable data pipelining between subgraphs, - /// each subgraph is settled on an IPU. (This feature requires the number of - /// IPUs > 1.) - /// \param ipu_batches_per_step The number of micro_batch_size per run. (This - /// feature requires to enable pipelining.) - /// \param ipu_batch_size The micro_batch_size which is the batch_size in the - /// graph. - /// \param ipu_need_avg_shard Enable the auto graph sharding. (This feature - /// requires the number of IPUs > 1.) - /// - void EnableIpu(int device_num = 1, bool ipu_enable_pipelining = false, - int ipu_batches_per_step = 1, int ipu_batch_size = 1, - bool ipu_need_avg_shard = false); + /// \param ipu_device_num the number of IPUs. + /// \param ipu_micro_batch_size the batch size in the graph, only work with + /// mutable input shapes. + /// \param ipu_enable_pipelining enable pipelining. + /// \param ipu_batches_per_step the number of batches per run in pipelining. + /// + void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1, + bool ipu_enable_pipelining = false, + int ipu_batches_per_step = 1); + + /// + /// \brief Set IPU config. + /// + /// \param ipu_enable_fp16 enable fp16. + /// \param ipu_replica_num the number of graph replication. 
+ /// \param ipu_available_memory_proportion the available memory proportion for + /// matmul/conv. + /// \param ipu_enable_half_partial enable fp16 partial for matmul, only work + /// with fp16. + /// + void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1, + float ipu_available_memory_proportion = 1.0, + bool ipu_enable_half_partial = false); + /// /// \brief Set XPU device id. /// @@ -876,11 +886,14 @@ struct PD_INFER_DECL AnalysisConfig { // ipu related. bool use_ipu_{false}; int ipu_device_num_{1}; - + int ipu_micro_batch_size_{1}; bool ipu_enable_pipelining_{false}; int ipu_batches_per_step_{1}; - int ipu_batch_size_{1}; - bool ipu_need_avg_shard_{false}; + + bool ipu_enable_fp16_{false}; + int ipu_replica_num_{1}; + float ipu_available_memory_proportion_{1.0}; + bool ipu_enable_half_partial_{false}; // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 24a72a0b9dadbd8123876cd8a91dccb22e1c8de2..81eecbb2c1480499b81556c48d021a8ff8929899 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -45,7 +45,7 @@ enum DataType { // TODO(Superjomn) support more data types if needed. }; -enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU }; +enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 8d3e091dbf5abeff5e32571666e76d50bf91941e..e8e9d895b4e8fb982ccb667352fd6c26228782a5 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 04ae3b9afe32c1762399e987ac5be8bc312d4d59..eeaa128290339ce8c2ac6961c575d64abaa3c1db 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -38,8 +38,6 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { dst->emplace_back(v); } } -template void SetLoD( - paddle::lite::LoD* dst, const framework::LoD& src); template void SetLoD( framework::LoD* dst, const paddle::lite::LoD& src); @@ -200,7 +198,7 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite_api::Tensor& src) { dst->mutable_data( inference::lite::utils::GetNativePlace(src.target()), - framework::TransToPtenDataType(GetNativePrecisionType(src.precision()))); + framework::TransToPhiDataType(GetNativePrecisionType(src.precision()))); SetLoD(dst->mutable_lod(), src.lod()); } @@ -271,7 +269,7 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType( holder, - framework::TransToPtenDataType(GetNativePrecisionType(src->precision()))); + framework::TransToPhiDataType(GetNativePrecisionType(src->precision()))); } } // namespace utils diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index a58de101053b3847db063bef5b5870992676b124..daa3b186ab4c4ca95d17d1bbd26a8cf32b4f4416 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -51,21 +51,11 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front(); - std::vector id_names; std::vector emb_names; - - id_names = - std::vector{word_id_name, pos_id_name, sent_id_name}; emb_names = std::vector{word_emb_name, pos_emb_name, sent_emb_name}; - int input_num = id_names.size(); - - // Declare inputs - std::vector input_ids; - for (int i = 0; i < input_num; i++) { - input_ids.push_back(engine_->GetITensor(id_names[i])); - } + int input_num = emb_names.size(); // input_embs[0]: word_embedding // input_embs[1]: pos_embedding @@ -126,7 +116,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { {"bert_embeddings_position_embeddings", input_embs[1], nvinfer1::PluginFieldType::kFLOAT32, static_cast(emb_sizes[1])}, - {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, + {"output_fp16", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, }; nvinfer1::PluginFieldCollection* plugin_ptr = @@ -156,7 +146,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { shuffle_layer->setReshapeDimensions(shape_dim); shuffle_layer->setName( ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); plugin_inputs.emplace_back( @@ -170,7 +160,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto plugin_layer = engine_->network()->addPluginV2( plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); 
plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); free(plugin_ptr); float out_0_scale = diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 521e04b8974fd5a761b337ecc618cf061b90a79a..d9eca65fc45dcd44725c79aaa07e1d618a15a539 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -92,8 +92,10 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { "fail to add CustomPrelnSkipLayerNormPluginDynamic layer")); layer = plugin_layer; - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_name}, + std::vector output_names; + output_names.push_back(op_desc.Output("Out_0")[0]); + output_names.push_back(op_desc.Output("Out_1")[0]); + RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_names}, test_mode); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 9cefb24751e18dfbb3b8283152cbcd58c81adc58..46e6c18bfb8e31ee3b8bd8f225ebe15443eb9efc 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -88,5 +88,5 @@ class SoftMaxOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(softmax); +USE_OP_ITSELF(softmax); REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc index b6fdcddf309d85a68ea67f33c157fbcf5ce5affc..9cd5e81141598dda6ead275457c53feeb84c5fb8 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -45,4 +45,4 @@ TEST(SoftMaxOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(softmax); +USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 436c80d9a6bcf27ad00451642119c54760029669..7ddd4b558228b8577554352089aab1a9b62e16b0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -560,12 +560,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } +#if !IS_TRT_VERSION_GE(7000) auto* x_var_desc = block->FindVar(desc.Input("X")[0]); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { VLOG(3) << "Gather does not support 1-dimensional input in tensorrt"; return false; } +#endif } } diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index ecf06e9bf15139990d5746a11592816ecde9f9f9..324e9c0392c9397837e05392bd7b0f755e6e14bf 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -113,12 +113,12 @@ nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( template __global__ void SpecialSliceKernel(const T* slice_input, const int32_t* cu_seqlens, T* output) { - const int hidden = blockDim.x * gridDim.y; - const int batch = blockIdx.x; - const int local_idx = blockIdx.y * blockDim.y + 
threadIdx.x; + const int hidden = blockDim.x * gridDim.x; + const int hidden_id = blockIdx.x * blockDim.x + threadIdx.x; + const int batch_id = blockIdx.y; - output[batch * hidden + local_idx] = - slice_input[cu_seqlens[batch] * hidden + local_idx]; + output[batch_id * hidden + hidden_id] = + slice_input[cu_seqlens[batch_id] * hidden + hidden_id]; } int SpecialSlicePluginDynamic::enqueue( @@ -137,15 +137,16 @@ int SpecialSlicePluginDynamic::enqueue( "hidden should be multiple of 128.")); constexpr int num_threads = 128; - const dim3 blocks(out_dims.d[0], hidden / num_threads); - const half* slice_input = static_cast(inputs[0]); const int32_t* cu_seqlens = static_cast(inputs[1]); half* output = static_cast(outputs[0]); - SpecialSliceKernel<<>>(slice_input, - cu_seqlens, output); + const int32_t num_blocks_x = hidden / num_threads; + const int32_t num_blocks_y = out_dims.d[0]; // batchs + const dim3 num_blocks(num_blocks_x, num_blocks_y); // blocks + SpecialSliceKernel<<>>( + slice_input, cu_seqlens, output); return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9dafd0d17c7157c0e351b67d0a01fccccbdbc47a..37214534f3c937bcf62bb34b51da2c934c566ced 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -299,7 +299,9 @@ inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) -inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +if (WITH_GPU) + inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +endif() inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) # Ernie large @@ -551,7 +553,9 @@ endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +if (WITH_GPU) + inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +endif() # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") @@ -741,13 +745,15 @@ set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120) +if (WITH_GPU) + set_tests_properties(test_analyzer_bert 
PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +endif() if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120) if(WITH_MKLDNN) @@ -758,11 +764,30 @@ if(ON_INFER OR WITH_GPU) set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120) endif() -# IPU if (WITH_IPU) - #resnet50 + #word2vec sample + set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model") + inference_analysis_test(ipu_word2vec_sample SRCS ipu_word2vec_sample.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) + + # ERNIE + set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") + inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc + ARGS --warmup=true --repeat=10) + inference_analysis_api_test(ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc + ARGS --warmup=true --repeat=10) + + # Resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=1000) + ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) + inference_analysis_test(ipu_resnet50_fp16_test SRCS ipu_resnet50_fp16_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) + + # Only support Resnet50 and Ernie currently + inference_analysis_api_test(ipu_multi_model_profile SRCS ipu_multi_model_profile.cc + ARGS --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h index 2582a1cb09eef02272f441376cec73b196142f10..fffcd38d95a0c06ed375438c2fb9d201ce7b2a7f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h @@ -150,8 +150,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, void SetIpuConfig(AnalysisConfig *cfg, int batch_size = 1) { cfg->SetModel(FLAGS_infer_model); - // num_ipu, enable_pipelining, batches_per_step, batch_size, need_avg_shard - cfg->EnableIpu(4, false, 1, batch_size, true); + cfg->EnableIpu(4, batch_size, false, 1); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa775bd9a9cb99c2566133f474a8bc529336477e --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
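+
+// This test parses Ernie inputs from FLAGS_infer_data, converts the float
+// input_mask feed to FP16, runs the model on a single IPU with FP16 enabled
+// (see SetConfig below), and checks the FP32-converted outputs against
+// FLAGS_refer_result with a tolerance of 5e-3.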
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + // fp32 to fp16 + ConvertFP32toFP16(input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
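+  // Each parsed line becomes one sample: three placeholder id tensors plus the
+  // FP16-converted input_mask; reading stops after batch_size samples unless
+  // FLAGS_test_all_data is set.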
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, int batch_size = 1) { + cfg->SetModel(FLAGS_infer_model); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + cfg->EnableIpu(1, batch_size, false); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + cfg->SetIpuConfig(true, 1, 1.0, true); +} + +// Compare results +TEST(Analyzer_Ernie_ipu, compare_results) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + auto predictor = CreateTestPredictor( + reinterpret_cast(&cfg), + FLAGS_use_analysis); + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + ConvertFP16toFP32(output); + auto outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *fp32_data = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], fp32_data[j], 5e-3); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_ernie_test.cc b/paddle/fluid/inference/tests/api/ipu_ernie_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e36917c9acd3eb56f6a5004d092c3d6839ceb101 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_ernie_test.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
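+
+// FP32 counterpart of the FP16 test above: the same Ernie inputs are run on a
+// single IPU without FP16, covering a performance profile, a determinism check
+// (compare_determine) and an accuracy comparison against FLAGS_refer_result.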
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, int batch_size = 1) { + cfg->SetModel(FLAGS_infer_model); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + cfg->EnableIpu(1, batch_size, false); +} + +void profile() { + AnalysisConfig config; + SetConfig(&config); + + std::vector> outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +// Compare Deterministic result +TEST(Analyzer_Ernie_ipu, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +// Compare results +TEST(Analyzer_Ernie_ipu, compare_results) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + auto predictor = CreateTestPredictor( + reinterpret_cast(&cfg), + FLAGS_use_analysis); + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + auto outputs_size = outputs.front().data.length() / (sizeof(float)); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], + static_cast(outputs[0].data.data())[j], + FLAGS_accuracy); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc b/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc new file mode 100644 index 0000000000000000000000000000000000000000..a225feae4a2619a7c9e26e5dd0ab4bfa584b1938 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +void ErnieInputData(const int &total_batch_size, const bool enable_fp16, + std::vector *inputs) { + const int input_num = total_batch_size * 128 * 1; + std::vector placeholder_012(input_num, 1); + std::vector placeholder_3(input_num, 1); + + for (int i = 0; i < 4; i++) { + PaddleTensor in; + in.name = "placeholder_" + std::to_string(i); + in.shape = {total_batch_size, 128, 1}; + if (i < 3) { + in.data = PaddleBuf(static_cast(placeholder_012.data()), + input_num * sizeof(int64_t)); + in.dtype = PaddleDType::INT64; + } else { + in.data = PaddleBuf(static_cast(placeholder_3.data()), + input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + if (enable_fp16) { + ConvertFP32toFP16(in); + } + } + inputs->push_back(std::move(in)); + } +} + +void Resnet50InputData(const int &total_batch_size, const bool enable_fp16, + std::vector *inputs) { + const int input_num = total_batch_size * 3 * 318 * 318; + std::vector input(input_num, 1); + PaddleTensor in; + in.shape = {total_batch_size, 3, 318, 318}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + if (enable_fp16) { + ConvertFP32toFP16(in); + } + inputs->push_back(std::move(in)); +} + +// performance profile +TEST(Analyzer_ipu_fp16, performance_profile) { + AnalysisConfig config; + std::vector inputs; + std::vector> outputs; + + int total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_replica_num; + if (FLAGS_ipu_enable_pipelining) { + // if device_num > 1 and pipelining is enabled, the total batch size = + // micro_batch_size * device_num(batches_per_step) * replica_num + total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_batches_per_step * + FLAGS_ipu_replica_num; + } + + if (FLAGS_model_name == "Resnet50") { + config.SetModel(FLAGS_infer_model + "/model/model", + FLAGS_infer_model + "/model/params"); + Resnet50InputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs); + } else if (FLAGS_model_name == "Ernie") { + config.SetModel(FLAGS_infer_model + "/model/"); + ErnieInputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support Resnet50 and Ernie Currently")); + } + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining, + // ipu_batches_per_step + config.EnableIpu(FLAGS_ipu_device_num, FLAGS_ipu_micro_batch_size, + FLAGS_ipu_enable_pipelining, FLAGS_ipu_batches_per_step); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + config.SetIpuConfig(FLAGS_ipu_enable_fp16, FLAGS_ipu_replica_num, + FLAGS_ipu_available_memory_proportion, + FLAGS_ipu_enable_half_partial); + + TestPrediction(reinterpret_cast(&config), + {inputs}, &outputs, 1); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d69069da0716017a8dd4ce62fbe2a083516a40c --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +// Compare results with 1 batch +TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { + std::string model_dir = FLAGS_infer_model + "/" + "model"; + AnalysisConfig config; + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + config.SetIpuConfig(true, 1, 1.0, true); + config.SetModel(model_dir + "/model", model_dir + "/params"); + + std::vector inputs; + auto predictor = CreatePaddlePredictor(config); + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + + PaddleTensor in; + in.shape = {batch, channel, height, width}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + ConvertFP32toFP16(in); + inputs.emplace_back(in); + + std::vector outputs; + + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + + const std::vector truth_values = { + 127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, + 736.222f, -633.684f, -329.927f, -430.155f, -633.062f, -146.548f, + -1324.28f, -1349.36f, -242.675f, 117.448f, -801.723f, -391.514f, + -404.818f, 454.16f, 515.48f, -133.031f, 69.293f, 590.096f, + -1434.69f, -1070.89f, 307.074f, 400.525f, -316.12f, -587.125f, + -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f, -447.938f, + 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f, + 551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, + 246.019f, -8.42969f, 131.365f, -648.051f}; + + const size_t expected_size = 1; + EXPECT_EQ(outputs.size(), expected_size); + + auto output = outputs.front(); + ConvertFP16toFP32(output); + auto outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float* fp32_data = reinterpret_cast(output.data.data()); + + for (size_t j = 0; j < outputs_size; j += 10) { + EXPECT_NEAR((fp32_data[j] - truth_values[j / 10]) / truth_values[j / 10], + 0., 9e-2); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc index f5e755ab466915d03d799e565a14107ff2f62f23..5fde8e6a5e1e676d5dacfb9c4c0c1d876130844b 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc @@ -33,9 +33,8 @@ static std::vector truth_values = { TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - // num_ipu, enable_pipelining, batches_per_step, batch_size, - // need_avg_shard - config.EnableIpu(1, false); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); config.SetModel(model_dir + "/model", model_dir + "/params"); std::vector inputs; @@ -72,9 +71,8 @@ 
TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { TEST(Analyzer_Resnet50_ipu, compare_results_2_batch) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - // num_ipu, enable_pipelining, batches_per_step, batch_size, - // need_avg_shard - config.EnableIpu(2, false, 1, 2, 1); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 2, false); config.SetModel(model_dir + "/model", model_dir + "/params"); std::vector inputs; diff --git a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc new file mode 100644 index 0000000000000000000000000000000000000000..d38c5c3416351ae6b55d3e5ea8632290e8e202a7 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a simple demo for how to take a model for inference with + * IPUs. + * Model: wget -q + * http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz + */ + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(infer_model, "", "Directory of the inference model."); + +using paddle_infer::Config; +using paddle_infer::Predictor; +using paddle_infer::CreatePredictor; + +void inference(std::string model_path, bool use_ipu, + std::vector *out_data) { + //# 1. Create Predictor with a config. + Config config; + config.SetModel(FLAGS_infer_model); + if (use_ipu) { + // ipu_device_num, ipu_micro_batch_size + config.EnableIpu(1, 4); + } + auto predictor = CreatePredictor(config); + + //# 2. Prepare input/output tensor. + auto input_names = predictor->GetInputNames(); + std::vector data{1, 2, 3, 4}; + // For simplicity, we set all the slots with the same data. + for (auto input_name : input_names) { + auto input_tensor = predictor->GetInputHandle(input_name); + input_tensor->Reshape({4, 1}); + input_tensor->CopyFromCpu(data.data()); + } + + //# 3. Run + predictor->Run(); + + //# 4. Get output. 
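+  // The first output tensor is fetched by name; out_num is the product of its
+  // shape dims, and CopyToCpu fills out_data with that many floats.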
+ auto output_names = predictor->GetOutputNames(); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_tensor->CopyToCpu(out_data->data()); +} + +int main(int argc, char *argv[]) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + std::vector ipu_result; + std::vector cpu_result; + inference(FLAGS_infer_model, true, &ipu_result); + inference(FLAGS_infer_model, false, &cpu_result); + for (size_t i = 0; i < ipu_result.size(); i++) { + CHECK_NEAR(ipu_result[i], cpu_result[i], 1e-6); + } + LOG(INFO) << "Finished"; +} diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 77fab0a86f83300b89d75ae0cd1ce7fa5bf03a5a..637fa16e31ba7996713a6971c3a1802627811e7f 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -76,10 +76,23 @@ DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); DEFINE_bool(fuse_multi_gru, false, "Running the inference program with multi_gru_fuse_pass"); +// ipu related +DEFINE_int32(ipu_micro_batch_size, 1, "micro batch size"); +DEFINE_int32(ipu_device_num, 1, "device num"); +DEFINE_bool(ipu_enable_pipelining, false, "enable pipelining"); +DEFINE_int32(ipu_batches_per_step, 1, + "the number of batches per run in pipelining"); +DEFINE_bool(ipu_enable_fp16, false, "enable fp16"); +DEFINE_int32(ipu_replica_num, 1, "replica num"); +DEFINE_double(ipu_available_memory_proportion, 1.0, + "available memory proportion"); +DEFINE_bool(ipu_enable_half_partial, false, "enable half partial"); + namespace paddle { namespace inference { using paddle::framework::proto::VarType; +using float16 = paddle::platform::float16; template constexpr paddle::PaddleDType GetPaddleDType(); @@ -1060,5 +1073,44 @@ static bool CompareTensor(const framework::LoDTensor &a, return true; } +void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT + ) { + int num = 1; + for (auto dim : tensor.shape) { + num *= dim; + } + PADDLE_ENFORCE_EQ( + tensor.dtype, PaddleDType::FLOAT32, + platform::errors::InvalidArgument( + "The tensor dtype is not float32, only support float32 as input")); + float *fp32_data = reinterpret_cast(tensor.data.data()); + float16 *fp16_data = new float16[num]; + for (int i = 0; i < num; i++) { + fp16_data[i] = float16(fp32_data[i]); + } + tensor.data = + PaddleBuf(static_cast(fp16_data), num * sizeof(float16)); + tensor.dtype = PaddleDType::FLOAT16; +} + +void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT + ) { + int num = 1; + for (auto dim : tensor.shape) { + num *= dim; + } + PADDLE_ENFORCE_EQ( + tensor.dtype, PaddleDType::FLOAT16, + platform::errors::InvalidArgument( + "The tensor dtype is not float16, only support float16 as input")); + float16 *fp16_data = reinterpret_cast(tensor.data.data()); + float *fp32_data = new float[num]; + for (int i = 0; i < num; i++) { + fp32_data[i] = static_cast(fp16_data[i]); + } + tensor.data = PaddleBuf(static_cast(fp32_data), num * sizeof(float)); + tensor.dtype = PaddleDType::FLOAT32; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9b2aaa9308e5df7a1527d0fa217ab12ae1ecc156..4d0e485285146e5668793d29fd8effc789fcc339 100644 --- 
a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -493,7 +493,8 @@ class AllocatorFacadePrivate { "support allocating managed memory.\n" "If you don't actually need to use managed memory, please disable " "it with command `export FLAGS_use_cuda_managed_memory=false`.\n" - "Or you must use the gpu device that supports managed memory.")); + "Or you must use the gpu device that supports managed memory.", + p.device)); } return std::make_shared(p); } diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index d86e5e35c08c0ef46ce86c0f372fc90f8df1811b..f5e4941d787097b5e349c0b668d6c95fad137873 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -18,6 +18,7 @@ #include // NOLINT #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/platform/flags.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" PADDLE_DEFINE_EXPORTED_READONLY_bool( free_idle_chunk, false, @@ -47,6 +48,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { + platform::RecordEvent("AutoGrowthBestFitAllocator::Allocate", + platform::TracerEventType::UserDefined, 9 /*level*/); size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -108,6 +111,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( } void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { + platform::RecordEvent("AutoGrowthBestFitAllocator::Free", + platform::TracerEventType::UserDefined, 9 /*level*/); VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index e7b86d6ec19c06d4ee9086590763f1afe23f99a9..8627e3e6f8811e162ce3014c01145f331a03ee4b 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace memory { @@ -117,6 +118,8 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { + platform::RecordEvent("StreamSafeCUDAAllocator::Allocate", + platform::TracerEventType::UserDefined, 9 /*level*/); ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -144,6 +147,8 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { + platform::RecordEvent("StreamSafeCUDAAllocator::Free", + platform::TracerEventType::UserDefined, 9 /*level*/); StreamSafeCUDAAllocation* stream_safe_cuda_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, diff --git a/paddle/fluid/memory/cuda_managed_memory_test.cu b/paddle/fluid/memory/cuda_managed_memory_test.cu index 4243c5fa90f7fad4f7a98a9d87545ef66cbe9875..f8c9ff82f57127d43bba8e7e03770dd3280832a8 100644 --- a/paddle/fluid/memory/cuda_managed_memory_test.cu +++ b/paddle/fluid/memory/cuda_managed_memory_test.cu @@ -128,6 +128,9 @@ TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) { } TEST(ManagedMemoryTest, OOMExceptionTest) { + if (!platform::IsGPUManagedMemorySupported(0)) { + return; + } EXPECT_THROW(Alloc(platform::CUDAPlace(0), size_t(1) << 60), memory::allocation::BadAlloc); } diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index a71e5fe9877c5b9bf886cb6afedb2ac7c4aab155..166cdd0b5d6b6a523cfe470662951184ebbfabc5 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -246,7 +246,8 @@ void Copy(platform::NPUPlace dst_place, << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); + platform::RecordEvent record_event( + "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -256,7 +257,8 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); + platform::RecordEvent record_event( + "NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } } @@ -275,14 +277,16 @@ void Copy(platform::CPUPlace dst_place, << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); + platform::RecordEvent record_event( + "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); + platform::RecordEvent record_event( + "NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -300,7 +304,9 @@ void Copy(platform::NPUPlace dst_place, if (dst_place == src_place) { 
platform::SetNPUDeviceId(src_place.device); if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -308,7 +314,9 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } } else { @@ -318,7 +326,9 @@ void Copy(platform::NPUPlace dst_place, } if (stream) { // TODO(zhiqiu): support peer access? - platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -326,7 +336,9 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } } @@ -374,14 +386,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -398,7 +414,9 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -408,7 +426,9 @@ void Copy( platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } } @@ -596,7 +616,8 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); + platform::RecordEvent record_event( + "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, 
hipMemcpyDeviceToHost, reinterpret_cast(stream)); @@ -605,7 +626,8 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); + platform::RecordEvent record_event( + "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #else @@ -628,7 +650,8 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); + platform::RecordEvent record_event( + "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); @@ -637,7 +660,8 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); + platform::RecordEvent record_event( + "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #else @@ -661,7 +685,9 @@ void Copy( if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); @@ -670,7 +696,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); #else @@ -679,11 +707,15 @@ void Copy( } } else { if (stream) { - platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", + platform::TracerEventType::UserDefined, + 1); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, reinterpret_cast(stream)); } else { - platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", + platform::TracerEventType::UserDefined, + 1); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } @@ -729,7 +761,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); @@ -738,7 +772,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #else @@ -758,7 +794,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " 
by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); @@ -767,7 +805,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #else @@ -927,7 +967,9 @@ void Copy(platform::CPUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); + platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2HAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -936,7 +978,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); + platform::RecordEvent record_event( + "MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1); platform::MLUMemcpyD2HSync(dst, src, num); } } @@ -953,7 +996,9 @@ void Copy(platform::MLUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); + platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyH2DAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -962,7 +1007,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); + platform::RecordEvent record_event( + "MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1); platform::MLUMemcpyH2DSync(dst, src, num); } } @@ -980,8 +1026,9 @@ void Copy(platform::MLUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event( - "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2DAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -991,20 +1038,26 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2DSync(dst, src, num); } } else { if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); + platform::RecordEvent 
record_event("MLUMemcpyPeerAsync:MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a279c76430f1b046a4c3ca05485824d5e3b62de2..91a0352e1915e95378012aa398ff996cbc10f216 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -100,7 +100,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten pten_api_utils gather_scatter_kernel) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 149a87fe32da16e850d5d64fb519c9bde7afef62..c28026a4bd43aac5b0c447e24a164e27233076e8 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -16,7 +16,10 @@ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -27,16 +30,6 @@ namespace operators { class AbsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "abs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "abs"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class AbsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -148,11 +141,15 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(abs, ops::AbsOp, ops::AbsOpMaker, ops::AbsGradMaker, - ops::AbsGradMaker); + ops::AbsGradMaker, + AbsInferShapeFunctor); REGISTER_OPERATOR(abs_grad, ops::AbsGradOp, ops::AbsDoubleGradMaker, diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 915b4daeeb525f15e9db0f63d0f2212f31143fea..de4d7818020dd586547ff9eedb53108285048c09 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/addmm_op.h" #include #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -24,6 +27,8 @@ limitations under the License. */ namespace paddle { namespace operators { +constexpr int kMULMKLDNNINT8 = 1; + using framework::OpKernelType; using framework::Tensor; @@ -31,85 +36,6 @@ class AddMMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::NotFound( - "Input(Input) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::NotFound("Input(Y) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of AddMMOp should not be null.")); - - auto input_dims = ctx->GetInputDim("Input"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto ndim_input = input_dims.size(); - auto ndim_x = x_dims.size(); - auto ndim_y = y_dims.size(); - - float alpha = ctx->Attrs().Get("Alpha"); - float beta = ctx->Attrs().Get("Beta"); - - VLOG(3) << "addmm operator input.shape=" << input_dims - << " x.shape=" << x_dims << " y.shape=" << y_dims - << " beta=" << beta << " alpha=" << alpha - << " ndim_input=" << ndim_input << " ndim_x=" << ndim_x - << " ndim_y=" << ndim_y; - - PADDLE_ENFORCE_NE(phi::product(input_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable Input(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("Input").front())); - - PADDLE_ENFORCE_NE(phi::product(x_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable X(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("X").front())); - - PADDLE_ENFORCE_NE(phi::product(y_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable Y(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("Y").front())); - // dim check - PADDLE_ENFORCE_EQ(ndim_input, 2, - platform::errors::InvalidArgument( - "The input tensor input's dimension must be 2. " - "But received input's dimension = [%s].", - ndim_input)); - PADDLE_ENFORCE_EQ(ndim_x, 2, - platform::errors::InvalidArgument( - "The input tensor x's dimension must be 2. " - "But received x's dimension = [%s].", - ndim_x)); - PADDLE_ENFORCE_EQ(ndim_y, 2, - platform::errors::InvalidArgument( - "The input tensor y's dimension must be 2. 
" - "But received y's dimension = [%s].", - ndim_y)); - - std::vector output_dims; - output_dims.push_back(x_dims[0]); - output_dims.push_back(y_dims[1]); - - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); - ctx->ShareLoD("Input", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library = framework::LibraryType::kPlain; @@ -221,17 +147,11 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PT_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, - ops::AddMMOpGradMaker); + ops::AddMMOpGradMaker, + AddmmInferShapeFunctor); REGISTER_OPERATOR(addmm_grad, ops::AddMMGradOp); - -REGISTER_OP_CPU_KERNEL( - addmm, ops::AddMMKernel, - ops::AddMMKernel); - -REGISTER_OP_CPU_KERNEL( - addmm_grad, ops::AddMMGradKernel, - ops::AddMMGradKernel); diff --git a/paddle/fluid/operators/addmm_op.cu b/paddle/fluid/operators/addmm_op.cu deleted file mode 100644 index e42d9c84f9234a756362acd67029b2ace4f6c9fb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/addmm_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/addmm_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(addmm, ops::AddMMKernel, - ops::AddMMKernel); -REGISTER_OP_CUDA_KERNEL(addmm_grad, - ops::AddMMGradKernel, - ops::AddMMGradKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h deleted file mode 100644 index 9d225ba99919249982924e382f2661d7481ed0c7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/addmm_op.h +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; - -using Tensor = framework::Tensor; - -constexpr int kMULMKLDNNINT8 = 1; - -template -class AddMMKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* x = context.Input("X"); - const Tensor* y = context.Input("Y"); - - auto input_dims = input->dims(); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - // broadcast mode check - if (x_dims[0] != input_dims[0]) { - PADDLE_ENFORCE_EQ(input_dims[0], 1, - platform::errors::InvalidArgument( - "When x_dims[0] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[0])); - PADDLE_ENFORCE_EQ( - y_dims[1] == input_dims[1] || input_dims[1] == 1, true, - platform::errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, x_dims, y_dims)); - } - // broadcast mode check - if (y_dims[1] != input_dims[1]) { - PADDLE_ENFORCE_EQ(input_dims[1], 1, - platform::errors::InvalidArgument( - "When y_dims[1] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[1])); - PADDLE_ENFORCE_EQ( - x_dims[0] == input_dims[0] || input_dims[0] == 1, true, - platform::errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, x_dims, y_dims)); - } - // broadcast mode check - PADDLE_ENFORCE_EQ( - x_dims[1], y_dims[0], - platform::errors::InvalidArgument( - "The input tensor X's width must be equal with matrix Y' height. 
" - "But received X's shape = [%s], Y's shape = [%s].", - x_dims[1], y_dims[0])); - - auto* out = context.Output("Out"); - out->mutable_data({x_dims[0], y_dims[1]}, context.GetPlace()); - - float alpha = context.template Attr("Alpha"); - float beta = context.template Attr("Beta"); - - auto blas = phi::funcs::GetBlas(context); - - // calc broadcast dim - Array2 bcast_dims; - bcast_dims[0] = x_dims[0] / input_dims[0]; - bcast_dims[1] = y_dims[1] / input_dims[1]; - VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; - // broadcast using eigen - auto eigen_input = EigenTensor::From(*input); - auto eigen_out = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, 2>::Eval( - place, eigen_out, eigen_input, bcast_dims); - - blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, - x->data(), x_dims[1], y->data(), y_dims[1], beta, - out->data(), y_dims[1]); - } -}; - -template -class AddMMGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto in_dims = ctx.Input("Input")->dims(); - auto* dinput = - ctx.Output(framework::GradVarName("Input")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - float alpha = ctx.Attr("Alpha"); - float beta = ctx.Attr("Beta"); - - int total_elems = 0; - - VLOG(3) << "alpha: " << alpha << " beta: " << beta; - - if (dinput != nullptr) { - dinput->set_lod(dout->lod()); - } - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (dinput) { - dinput->mutable_data(ctx.GetPlace()); - total_elems = in_dims[0] * in_dims[1]; - auto& place = - *ctx.template device_context().eigen_device(); - auto eigen_dout = EigenTensor::From(*dout); - auto eigen_dinput = EigenTensor::From(*dinput); - - bool row_compress = in_dims[0] != dout->dims()[0]; - bool col_compress = in_dims[1] != dout->dims()[1]; - auto eigen_dinput_shape = Array2(dinput->dims()[0], dinput->dims()[1]); - - if (row_compress && col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum().eval().reshape(eigen_dinput_shape); - } else if (row_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); - } else if (col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); - } else { - blas.VCOPY(total_elems, dout->data(), dinput->data()); - } - - blas.SCAL(total_elems, beta, dinput->data()); - } - if (dx) { - dx->mutable_data(ctx.GetPlace()); - total_elems = x->dims()[0] * x->dims()[1]; - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(*dout, false, *y, true, dx); - blas.SCAL(total_elems, alpha, dx->data()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - total_elems = x->dims()[1] * y->dims()[1]; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(*x, true, *dout, false, dy); - blas.SCAL(total_elems, alpha, dy->data()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 8ee6540bfa5f0c413f759f58ab506ac181c19c49..71a895c244c54f62c0af1745635c08fea35436c4 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/atan2_op.h" - -#include -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -25,16 +25,6 @@ namespace operators { class Atan2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X1"), "Input", "X1", "atan2"); - OP_INOUT_CHECK(ctx->HasInput("X2"), "Input", "X2", "atan2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "atan2"); - - auto in_dims = ctx->GetInputDim("X1"); - - ctx->SetOutputDim("Out", in_dims); - } }; class Atan2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -115,24 +105,11 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; - +DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PT_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, - ops::Atan2OpVarTypeInference); + ops::Atan2OpVarTypeInference, Atan2InferShapeFunctor); REGISTER_OPERATOR(atan2_grad, ops::Atan2GradOp); - -REGISTER_OP_CPU_KERNEL( - atan2, ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel); - -REGISTER_OP_CPU_KERNEL( - atan2_grad, ops::Atan2GradKernel, - ops::Atan2GradKernel, - ops::Atan2GradKernel); diff --git a/paddle/fluid/operators/atan2_op.cu b/paddle/fluid/operators/atan2_op.cu deleted file mode 100644 index faf1fde47e4c45a00836eee1d81ed1233170ecbe..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/atan2_op.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/atan2_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - atan2, ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel); - -REGISTER_OP_CUDA_KERNEL( - atan2_grad, - ops::Atan2GradKernel, - ops::Atan2GradKernel, - ops::Atan2GradKernel); diff --git a/paddle/fluid/operators/atan2_op.h b/paddle/fluid/operators/atan2_op.h deleted file mode 100644 index a0e64c301524e2051abf8d2fc1641e0bcfafe69d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/atan2_op.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using framework::To32BitIndex; - -template -struct Atan2Out { - using type = T; -}; - -template <> -struct Atan2Out { - using type = double; -}; - -template <> -struct Atan2Out { - using type = double; -}; - -template -struct Atan2Functor { - Atan2Functor(const T* x1, const T* x2, typename Atan2Out::type* out, - int64_t numel) - : x1_(x1), x2_(x2), out_(out), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - out_[idx] = static_cast::type>( - ::atan2f(static_cast(x1_[idx]), static_cast(x2_[idx]))); - } - - const T* x1_; - const T* x2_; - typename Atan2Out::type* out_; - int64_t numel_; -}; - -template <> -struct Atan2Functor { - Atan2Functor(const double* x1, const double* x2, double* out, int64_t numel) - : x1_(x1), x2_(x2), out_(out), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - out_[idx] = ::atan2(x1_[idx], x2_[idx]); - } - - const double* x1_; - const double* x2_; - double* out_; - int64_t numel_; -}; - -// dx1 = dout * x2 / ((x1)^2 + (x2)^2) -// dx2 = - dout * x1 / ((x1)^2 + (x2)^2) -template -struct Atan2GradFunctor { - Atan2GradFunctor(const T* x1, const T* x2, const T* dout, T* dx1, T* dx2, - int64_t numel) - : x1_(x1), x2_(x2), dout_(dout), dx1_(dx1), dx2_(dx2), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - float x1 = static_cast(x1_[idx]); - float x2 = static_cast(x2_[idx]); - float x = x1 * x1 + x2 * x2; - dx1_[idx] = static_cast(static_cast(dout_[idx]) * x2 / x); - dx2_[idx] = static_cast(-static_cast(dout_[idx]) * x1 / x); - } - - const T* x1_; - const T* x2_; - const T* dout_; - T* dx1_; - T* dx2_; - int64_t numel_; -}; - -template <> -struct Atan2GradFunctor { - Atan2GradFunctor(const double* x1, const double* x2, const double* dout, - double* dx1, double* dx2, int64_t numel) - : x1_(x1), x2_(x2), dout_(dout), dx1_(dx1), dx2_(dx2), numel_(numel) {} - - HOSTDEVICE void 
operator()(int64_t idx) const { - auto x = x1_[idx] * x1_[idx] + x2_[idx] * x2_[idx]; - dx1_[idx] = dout_[idx] * x2_[idx] / x; - dx2_[idx] = -dout_[idx] * x1_[idx] / x; - } - - const double* x1_; - const double* x2_; - const double* dout_; - double* dx1_; - double* dx2_; - int64_t numel_; -}; - -template -class Atan2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* X1 = context.Input("X1"); - const Tensor* X2 = context.Input("X2"); - Tensor* Out = context.Output("Out"); - - auto numel = X1->numel(); - auto x1 = X1->data(); - auto x2 = X2->data(); - auto out = Out->mutable_data::type>( - context.GetPlace(), size_t(numel * sizeof(typename Atan2Out::type))); - auto& dev_ctx = context.template device_context(); - - platform::ForRange for_range(dev_ctx, numel); - Atan2Functor functor(x1, x2, out, numel); - for_range(functor); - } -}; - -template -class Atan2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const { - const Tensor* X1 = context.Input("X1"); - const Tensor* X2 = context.Input("X2"); - const Tensor* dOut = context.Input(framework::GradVarName("Out")); - Tensor* dX1 = context.Output(framework::GradVarName("X1")); - Tensor* dX2 = context.Output(framework::GradVarName("X2")); - - auto numel = X1->numel(); - auto x1 = X1->data(); - auto x2 = X2->data(); - auto dout = dOut->data(); - auto dx1 = - dX1->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); - auto dx2 = - dX2->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); - auto& dev_ctx = context.template device_context(); - - platform::ForRange for_range(dev_ctx, numel); - Atan2GradFunctor functor(x1, x2, dout, dx1, dx2, numel); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 505acbbdbde1b0f5842ebbd272f4bfb930e812b7..6699df0c8dc59cbd4ce14a1f6d2b6523f21b590d 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -38,23 +38,25 @@ class BatchNormXPUKernel : public framework::OpKernel { bool global_stats = test_mode || use_global_stats; const auto &data_layout_str = ctx.Attr("data_layout"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); const auto *x_data = x->data(); @@ -75,6 +77,7 @@ class BatchNormXPUKernel : public framework::OpKernel { saved_variance->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); + bool is_nchw = data_layout_str == "NCHW"; if (!global_stats) { auto *mean_out_data = mean_out->data(); @@ -95,7 +98,7 @@ class BatchNormXPUKernel : public framework::OpKernel { int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, momentum, scale_data, bias_data, saved_mean_data, saved_variance_data, - mean_out_data, variance_out_data, true); + mean_out_data, variance_out_data, is_nchw); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The batch_norm XPU API return wrong value[%d %s]", @@ -107,7 +110,7 @@ class BatchNormXPUKernel : public framework::OpKernel { const auto *variance_data = variance->data(); int r = xpu::batch_norm_infer(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, scale_data, bias_data, - mean_data, variance_data, true); + mean_data, variance_data, is_nchw); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( @@ -168,11 +171,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - // TODO(guozbin): Transform input tensor from NHWC to NCHW - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); auto *d_x = ctx.Output(framework::GradVarName("X")); @@ -207,15 +210,15 @@ class BatchNormGradXPUKernel : public framework::OpKernel { } const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); const auto *x_data = x->data(); const auto *d_y_data = d_y->data(); @@ -250,38 +253,35 @@ class BatchNormGradXPUKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - const T *mean_data = nullptr; - const T *inv_var_data = nullptr; + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_inv_std = ctx.Input("SavedVariance"); + const auto *global_mean = ctx.Input("Mean"); + const auto *global_var = ctx.Input("Variance"); // TODO(guozibin): hadle the situation case of N * H * W = 1 - if (!use_global_stats) { - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - mean_data = saved_mean->data(); - inv_var_data = saved_inv_variance->data(); - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_data = running_variance->data(); - float *running_inv_var_data = - RAII_GUARD.alloc_l3_or_gm(running_variance->numel()); - float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); - int r1 = calculate_inv_var(dev_ctx.x_context(), inv_var_data, epsilon, C, - epsilon_data, running_inv_var_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); - inv_var_data = running_inv_var_data; - } if (is_inplace) { + float *global_inv_std_data; + if (use_global_stats) { + global_inv_std_data = + RAII_GUARD.alloc_l3_or_gm(global_var->numel()); + float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); + int r1 = + calculate_inv_var(dev_ctx.x_context(), global_var->data(), + epsilon, C, epsilon_data, global_inv_std_data); + PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); + } auto px = *x; + auto *inv_std_data = + use_global_stats ? global_inv_std_data : batch_inv_std->data(); + auto mean_data = use_global_stats ? 
global_mean->data() + : batch_mean->data(); int r2 = calculate_inv_BN_Y( dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), - scale->data(), bias->data(), mean_data, inv_var_data, N, + scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad " @@ -289,19 +289,29 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "return wrong value[%d %s]", r2, XPUAPIErrorMsg[r2])); } - if (!d_x) { - d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); - } - if (!d_scale) { - d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); - } - if (!d_bias_data) { - d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); - } - int r3 = xpu::batch_norm_grad( - dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, scale_data, - mean_data, inv_var_data, d_scale_data, d_bias_data, true); + int r3; + bool is_nchw = data_layout_str == "NCHW"; + if (use_global_stats) { + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, nullptr, nullptr, d_scale_data, d_bias_data, is_nchw, + global_mean->data(), global_var->data(), epsilon); + } else { + if (!d_x) { + d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); + } + if (!d_scale) { + d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); + } + if (!d_bias_data) { + d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); + } + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, batch_mean->data(), batch_inv_std->data(), + d_scale_data, d_bias_data, is_nchw); + } PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad) return " "wrong value[%d %s]", diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 1c390923d0b0ad92f73eced96a79771db7ad4010..55bb57466c7b5ec4f4ac3c51b1cf84ab5098a0e9 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bce_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -26,41 +29,6 @@ class BCELossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BCELoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "BCELoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(x_dims, labels_dims, - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same " - "shape. 
But received: the shape of Input(X) is " - "[%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -170,16 +138,12 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PT_INFER_META(phi::BCELossInferMeta)); + REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, ops::BCELossGradOpMaker, - ops::BCELossInplaceInferer); + ops::BCELossInplaceInferer, BCELossInferShapeFunctor); REGISTER_OPERATOR(bce_loss_grad, ops::BCELossGradOp, ops::BCELossGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - bce_loss, ops::BCELossOpKernel, - ops::BCELossOpKernel); -REGISTER_OP_CPU_KERNEL( - bce_loss_grad, - ops::BCELossGradOpKernel, - ops::BCELossGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu deleted file mode 100644 index f71fbbdc6b19e9a2c71b5194e8f2343d2398d62a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bce_loss_op.cu +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/operators/bce_loss_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -template -struct BCELossFunctor { - T one; - T neg_100; - - HOSTDEVICE inline BCELossFunctor() { - one = static_cast(1.0f); - neg_100 = static_cast(-100.); - } - - HOSTDEVICE inline T operator()(const T x, const T label) const { - PADDLE_ENFORCE( - (x >= static_cast(0)) && (x <= one), - "Input is expected to be within the interval [0, 1], but recieved %f.", - x); - T term1 = max(real_log(x), neg_100); - T term2 = max(real_log(one - x), neg_100); - return (((label - one) * term2) - (label * term1)); - } -}; - -template -struct BCELossGradFunctor { - T one; - T eps; - - HOSTDEVICE inline BCELossGradFunctor() { - one = static_cast(1.0f); - eps = static_cast(1e-12); - } - - HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const { - T term1 = max((one - x) * x, eps); - return (dout * (x - label) / term1); - } -}; - -using Tensor = framework::Tensor; - -template -class BCELossCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - std::vector ins = {x, labels}; - std::vector outs = {out}; - auto& dev_ctx = ctx.template device_context(); - auto functor = BCELossFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class BCELossGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - std::vector ins = {x, labels, dout}; - std::vector outs = {dx}; - auto& dev_ctx = ctx.template device_context(); - auto functor = BCELossGradFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bce_loss, - ops::BCELossCUDAKernel, - ops::BCELossCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - bce_loss_grad, - ops::BCELossGradCUDAKernel, - ops::BCELossGradCUDAKernel); diff --git a/paddle/fluid/operators/bce_loss_op.h b/paddle/fluid/operators/bce_loss_op.h deleted file mode 100644 index dd87b69efe2869727f2db778cec44612efbcff6b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bce_loss_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for max -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class BCELossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); - auto x_numel = x->numel(); - - // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - - // x) - label * ln(x) - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - out_data[i] = - (label_data[i] - static_cast(1)) * - std::max(real_log(static_cast(1) - x_data[i]), (T)(-100)) - - label_data[i] * std::max(real_log(x_data[i]), (T)(-100)); - } - } -}; - -template -class BCELossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto x_data = x->data(); - auto label_data = labels->data(); - - int x_numel = x->numel(); - - // dx = dout * ((x - label)/(x - x^2)) - for (int i = 0; i < x_numel; ++i) { - dx_data[i] = - dout_data[i] * ((x_data[i] - label_data[i]) / - std::max((static_cast(1) - x_data[i]) * x_data[i], - static_cast(1e-12))); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index 46e8a36d2eef73e59bfc22308e5c0b593bd2832d..c3cee6a7b0d5bf4b2e41bfc020e6c9fcd34677d9 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bce_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 915ad2f41cde33ee9519b06b38bb8a59fd37793b..4b1593b1f8b40c0c4380007f85f9bb74fea9cd44 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 253a96004bd30a2d6c0da456c578e8dc4b522cca..4774c0a1dbc3b78607d75efb7bc82d590ca4aa2a 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -12,84 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bilinear_tensor_product_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using framework::Tensor; - class BilinearTensorProductOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Weight"), true, - platform::errors::InvalidArgument("Input(Weight) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument("Output(Out) should not be null.")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto weight_dims = ctx->GetInputDim("Weight"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), 2UL, - platform::errors::InvalidArgument("The input(X) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - y_dims.size(), 2UL, - platform::errors::InvalidArgument("The input(Y) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - weight_dims.size(), 3UL, - platform::errors::InvalidArgument("Expected the input(Weight) is a 3D " - "tensor. 
But received %dD tensor.", - weight_dims.size())); - if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) { - PADDLE_ENFORCE_EQ( - x_dims[0], y_dims[0], - platform::errors::InvalidArgument( - "The first dimension(batch_size) of input(X) must be " - "equal to the first dimension of the input(Y).")); - } - PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], - platform::errors::InvalidArgument( - "The second dimension of input(X) must be equal to " - "the second dimension of the input(Weight).")); - PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], - platform::errors::InvalidArgument( - "The second dimension of input(Y) must be equal to " - "the third dimension of the input(Weight).")); - - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(bias_dims.size(), 2UL, - platform::errors::InvalidArgument( - "The Input(Bias) must be a 2-D tensor with " - "the 2nd dimension fixed to 1 (a row vector).")); - PADDLE_ENFORCE_EQ(bias_dims[0], 1UL, - platform::errors::InvalidArgument( - "The Input(Bias) must be a 2-D tensor with " - "the 2nd dimension fixed to 1 (a row vector).")); - PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], - platform::errors::InvalidArgument( - "The second dimension of input(Bias) must be equal " - "to the first dimension of the input(Weight).")); - } - - ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,59 +59,6 @@ Where $W_i$ is the $i$-th slice of Input(Weight); class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Weight"), true, - platform::errors::InvalidArgument("Input(Weight) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto weight_dims = ctx->GetInputDim("Weight"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, - platform::errors::InvalidArgument( - "The input(Out@GRAD) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[0], out_dims[0], - platform::errors::InvalidArgument( - "The first dimension(batch_size) of input(Out@GRAD) must be " - "equal to the first dimension of the Input(X).")); - PADDLE_ENFORCE_EQ( - weight_dims[0], out_dims[1], - platform::errors::InvalidArgument( - "The second dimension of input(Out@GRAD) must be equal to " - "the third dimension of the Input(Weight).")); - - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) { - ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); - } - - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - auto weight_grad_name = framework::GradVarName("Weight"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); - } - if 
(ctx->HasOutput(weight_grad_name)) { - ctx->SetOutputDim(weight_grad_name, weight_dims); - } - } }; template @@ -208,21 +89,20 @@ class BilinearTensorProductGradOpMaker } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, + BilinearTensorProductInferShapeFunctor, + PT_INFER_META(phi::BilinearTensorProductInferMeta)); +DELCARE_INFER_SHAPE_FUNCTOR( + bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, + PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, ops::BilinearTensorProductGradOpMaker, - ops::BilinearTensorProductGradOpMaker); + ops::BilinearTensorProductGradOpMaker, + BilinearTensorProductInferShapeFunctor); REGISTER_OPERATOR(bilinear_tensor_product_grad, - ops::BilinearTensorProductOpGrad); -REGISTER_OP_CPU_KERNEL( - bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_CPU_KERNEL( - bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductOpGrad, + BilinearTensorProductGradInferShapeFunctor); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu deleted file mode 100644 index c2b4f69e6854522b91dfd9fb5f738c0e5ffc77b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/bilinear_tensor_product_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_CUDA_KERNEL( - bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h deleted file mode 100644 index 2dbe3a132d78aed1593041bd83f682250f79596c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -using EigenMatrix = framework::EigenMatrix; - -template -class BilinearTensorProductKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto y_mat = EigenMatrix::From(*y); - auto output_mat = EigenMatrix::From(*out); - - auto batch_size = x->dims()[0]; - auto weight_dims = weight->dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - auto& place = *ctx.template device_context().eigen_device(); - auto& dev_ctx = ctx.template device_context(); - - // Create the intermediate variable to calculate the result of - // Input(X) multiplied by Input(Weight_i), the formula is: - // left_mul = X Weight_i. - Tensor left_mul; - left_mul.mutable_data(phi::make_ddim({batch_size, y_dim}), - ctx.GetPlace()); - auto left_mul_mat = EigenMatrix::From(left_mul); - - for (int i = 0; i < out_dim; ++i) { - auto output_col_vec = output_mat.chip(i, 1); - Tensor weight_mat = - weight->Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); - phi::funcs::GetBlas(dev_ctx).GEMM( - CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); - output_col_vec.device(place) = - (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); - } - if (bias) { - auto bias_vec = EigenMatrix::From(*bias); - Eigen::DSizes bcast(batch_size, 1); - output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; - } - } -}; - -template -class BilinearTensorProductGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor* weight = ctx.Input("Weight"); - Tensor* d_x = ctx.Output(framework::GradVarName("X")); - Tensor* d_y = ctx.Output(framework::GradVarName("Y")); - Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); - Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); - const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); - - auto batch_size = x->dims()[0]; - auto weight_dims = weight->dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - - auto x_mat = EigenMatrix::From(*x); - auto y_mat = EigenMatrix::From(*y); - auto d_out_mat = EigenMatrix::From(*d_out); - auto& place = *ctx.template device_context().eigen_device(); - auto& dev_ctx = ctx.template device_context(); - // Create the intermediate variable to calculate the Output(Y@Grad). - Tensor x_scale; - x_scale.mutable_data(phi::make_ddim({batch_size, x_dim}), - ctx.GetPlace()); - auto x_scale_mat = EigenMatrix::From(x_scale); - - // Create the intermediate variable to calculate the Output(X@Grad). 
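// A sketch of the gradient math this deleted BilinearTensorProductGradKernel
// implements (annotation added for readability; not part of the original file).
// The forward pass computes Out[b, i] = x[b, :] * W_i * y[b, :]^T + bias[i],
// so for every output slice i the backward pass accumulates, roughly:
//   d_x[b, :]  += d_out[b, i] * (y[b, :] * W_i^T)
//   d_y[b, :]  += d_out[b, i] * (x[b, :] * W_i)
//   d_W_i       = x_scale^T * y,  where x_scale[b, :] = d_out[b, i] * x[b, :]
//   d_bias      = sum of d_out over the batch dimension
// x_scale (above) and y_scale (below) hold these d_out-scaled operands so the
// GEMM calls in the per-slice loop can produce the accumulated gradients.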
- Tensor y_scale; - y_scale.mutable_data(phi::make_ddim({batch_size, y_dim}), - ctx.GetPlace()); - auto y_scale_mat = EigenMatrix::From(y_scale); - - phi::funcs::SetConstant set_zero; - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_x, static_cast(0)); - } - - if (d_y) { - d_y->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_y, static_cast(0)); - } - - if (d_weight) { - d_weight->mutable_data(ctx.GetPlace()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // Caculate the Output(X@Grad) and Output(Y@Grad). - if (d_x || d_y || d_weight) { - Eigen::DSizes bcast_for_x(1, y_dim); - Eigen::DSizes bcast_for_y(1, x_dim); - Eigen::DSizes bcast_for_weight(1, x_dim); - - for (int i = 0; i < out_dim; ++i) { - Tensor weight_i = - weight->Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); - auto output_vec = d_out_mat.chip(i, 1); - - if (d_x) { - y_scale_mat.device(place) = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_x) * - y_mat; - blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, - y_scale.data(), weight_i.data(), 1, d_x->data()); - } - - if (d_y || d_weight) { - auto output_vec_y = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_y); - x_scale_mat.device(place) = output_vec_y * x_mat; - if (d_y) { - blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, - x_scale.data(), weight_i.data(), 1, d_y->data()); - } - if (d_weight) { - Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( - phi::make_ddim({x_dim, y_dim})); - blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, - x_scale.data(), y->data(), 0, d_weight_i.data()); - } - } - } - } - - // calculate the gradient of Input(Bias). - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); - d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 4ca0dded3e7385234e3dc630e6260c08fb45f3a8..bc6cf9d831ff0faf00d3db7fdc6105f301781f8b 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -138,7 +138,7 @@ class CastOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -// cast use pten kernel, so no need to REGISTER_OP_CPU_KERNEL here. +// cast use phi kernel, so no need to REGISTER_OP_CPU_KERNEL here. 
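// A sketch of the pattern this patch applies across these operators (annotation
// added for clarity; not part of the original change): the fluid InferShape
// overrides are removed in favour of phi InferMeta functions that work on
// MetaTensor, and the per-device REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL
// lists disappear because the kernels are now registered inside phi. The
// illustration below uses a hypothetical op name, and the exact phi signatures
// may differ from what is shown:
//
//   // hypothetical phi meta function, e.g. in paddle/phi/infermeta/unary.h
//   void MyOpInferMeta(const MetaTensor& x, MetaTensor* out) {
//     out->set_dims(x.dims());    // output shape follows the input
//     out->set_dtype(x.dtype());  // output dtype follows the input
//     out->share_lod(x);          // keep LoD, as the old InferShape did
//   }
//
//   // operator side: wire the meta function in instead of overriding InferShape
//   DELCARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
//                               PT_INFER_META(phi::MyOpInferMeta));
//   REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker, MyOpInferShapeFunctor);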
REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, ops::CastOpGradMaker, diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 62d747cb9f4001e4fcee64a49ee8a16a49eb2617..034cb47fab189b3c7a712d4d720887de227d8573 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -63,12 +63,12 @@ class CastOpKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace(), static_cast(out_dtype)); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); // call new kernel phi::CastKernel( - static_cast::TYPE&>(dev_ctx), *in, pt_out_dtype, out); } diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index 25b3a446a0a32e61407d2ffa796c30d9a6625532..64324d9772b47de8dfec256f75f60873ce6aafeb 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -46,11 +46,11 @@ class CastXPUKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace(), static_cast(out_dtype)); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); - // call pten kernel + // call phi kernel phi::CastKernel( - static_cast::TYPE&>(dev_ctx), *in, pt_out_dtype, out); } diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 0902f5b6bc9e80adfb990c0bc6e80d12db408ea9..09e915a6bafd4a8b72f35995b3ebbfeafa00476a 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ using framework::Tensor; class CholeskyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cholesky"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cholesky"); - auto dims = ctx->GetInputDim("X"); - auto rank = dims.size(); - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. But " - "received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - dims[rank - 2], dims[rank - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should be symmetric " - "positive-definite matrices and have the same size. 
But received " - "X's shape[-2] = %d and shape[-1] = %d.", - dims[rank - 2], dims[rank - 1])); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } }; class CholeskyOpMaker : public framework::OpProtoAndCheckerMaker { @@ -107,15 +90,10 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PT_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, - ops::CholeskyGradOpMaker); + ops::CholeskyGradOpMaker, + CholeskyInferShapeFunctor); REGISTER_OPERATOR(cholesky_grad, ops::CholeskyGradOp); - -REGISTER_OP_CPU_KERNEL(cholesky, ops::CholeskyCPUKernel, - ops::CholeskyCPUKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_grad, - ops::CholeskyGradKernel, - ops::CholeskyGradKernel); diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu deleted file mode 100644 index 43c16d607c2dbaefdcb576a07ad607f934b0f08e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_op.cu +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" - -namespace paddle { -namespace operators { - -template -class CholeskyGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - bool upper = context.Attr("upper"); - auto& dims = x->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - int m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - const auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - - // matrices are assumed to be stored in column-major order in cusolver - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; - // portf is inplace, thus copy the triangular part of the input matrices to - // the output and set the other triangular part to 0 firstly - platform::ForRange for_range(dev_ctx, - tensor_size); - if (upper) { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ 0, /* num_upper_diags */ m, x_data, - out_data); - for_range(matrix_band_part_functor); - } else { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, x_data, - out_data); - for_range(matrix_band_part_functor); - } - - auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); - auto* info_ptr = reinterpret_cast(info->ptr()); - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) - if (batch_count > 1) { - std::vector output_ptrs; - for (int i = 0; i < batch_count; i++) { - output_ptrs.emplace_back(out_data + i * m * m); - } - thrust::device_vector dev_output_ptrs(output_ptrs.begin(), - output_ptrs.end()); - PotrfBatched(dev_ctx, uplo, m, - thrust::raw_pointer_cast(dev_output_ptrs.data()), m, - info_ptr, batch_count); - // TODO(guosheng): There seems to a bug in cusolver potrfBatched and need - // to clear the upper triangle of the output. Remove this workaround once - // the bug is fixed. - if (!upper) { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, out_data, - out_data); - for_range(matrix_band_part_functor); - } - } else { -#endif - for (int i = 0; i < batch_count; i++) { - Potrf(dev_ctx, uplo, m, out_data + i * m * m, m, info_ptr + i); - } - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) - } -#endif - // check the info - std::vector error_info; // only for checking positive matrix - error_info.resize(batch_count); - - memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(), - info_ptr, sizeof(int) * batch_count, dev_ctx.stream()); - - for (int i = 0; i < batch_count; ++i) { - PADDLE_ENFORCE_EQ(error_info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U.", i, - error_info[i], error_info[i])); - } - } - - void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, - int n, T* A, int lda, int* info) const; - - void PotrfBatched(const platform::CUDADeviceContext& dev_ctx, - cublasFillMode_t uplo, int n, T* Aarray[], int lda, - int* info_array, int batch_size) const; -}; - -#define FUNC_WITH_TYPES(m) m(float, S) m(double, D) - -#define POTRF_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::Potrf(const platform::CUDADeviceContext& dev_ctx, \ - cublasFillMode_t uplo, int n, T* A, \ - int lda, int* info) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - int workspace_size = 0; \ - PADDLE_ENFORCE_GPU_SUCCESS( \ - platform::dynload::cusolverDn##C##potrf_bufferSize( \ - handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = memory::Alloc(dev_ctx, workspace_size); \ - T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ - handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ - } - -FUNC_WITH_TYPES(POTRF_INSTANCE); - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) -#define POTRF_BATCH_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::PotrfBatched( \ - const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ - int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrfBatched( \ - handle, uplo, n, Aarray, lda, info_array, batch_size)); \ - } - -FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); -#endif - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(cholesky, ops::CholeskyGPUKernel, - ops::CholeskyGPUKernel); -REGISTER_OP_CUDA_KERNEL( - cholesky_grad, - ops::CholeskyGradKernel, - ops::CholeskyGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_op.h b/paddle/fluid/operators/cholesky_op.h deleted file mode 100644 index 9504909073f7911c305ef952bca49b5b0bbca47f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "Eigen/Cholesky" -#include "Eigen/Core" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CholeskyCPUKernel : public framework::OpKernel { - public: - // different with EigenMatrix in framework/eigen.h - using EigenMatrix = - Eigen::Matrix; - using InputMatrixMap = Eigen::Map; - using OutputMatrixMap = Eigen::Map; - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - bool upper = context.Attr("upper"); - auto& dims = x->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - - const auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - // Cholesky decomposition for each matrix, maybe can use multi threads - for (int i = 0; i < batch_count; i++) { - auto input = InputMatrixMap(x_data + i * m * m, m, m); - auto output = OutputMatrixMap(out_data + i * m * m, m, m); - if (upper) { - Eigen::LLT< - Eigen::Matrix, - Eigen::UpLoType::Upper> - llt_decomposition(input); - PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The " - "%d-th input matrice " - "might not be not be positive definite.", - i)); - output = llt_decomposition.matrixU(); - } else { - Eigen::LLT< - Eigen::Matrix, - Eigen::UpLoType::Lower> - llt_decomposition(input); - PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The " - "%d-th input matrice " - "might not be not be positive definite.", - i)); - output = llt_decomposition.matrixL(); - } - } - } -}; - -/*! 
Use these functors to implement tril, triu, diagonal and other operators */ -template -struct EyeFunctor { - EyeFunctor(const int m, const int n, T* output) - : m_(m), n_(n), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int global_row = index / n_; - const int col = index - global_row * n_; - const int batch = global_row / m_; - const int row = global_row - batch * m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_, n_; - T* output_; -}; - -template -struct MatrixBandPartFunctor { - /*! Set output as input value outside a central band and 0 inside that band. - * That is: output[i, j, ..., m, n] = in_band(m, n) * input[i, j, ..., m, n] - * where: in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && (num_upper - * < 0 || (n-m) <= num_upper) - */ - MatrixBandPartFunctor(const int m, const int n, const int num_lower_diags, - const int num_upper_diags, const T* input, T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = static_cast(0); - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T* input_; - T* output_; -}; - -template -struct MatrixSetDiagFunctor { - /*! Overwrite specified diagonals of output by the values in diagonal. - * diagonals can be a central band specified by num_diags and - * upper_diag_index, where upper_diag_index=0 refers to the main diagonal, - * positive value means superdiagonal and negative value means subdiagonal. - * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len] - * and the num_diags diagonals has a up to down layout. Otherwise it has a - * shape [i, j, ..., max_diag_len]. - */ - MatrixSetDiagFunctor(const int m, const int n, const int num_diags, - const int max_diag_len, const int upper_diag_index, - const T* diag, T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - diag_(diag), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_diag_index * max_diag_len_; - const int batch = batch_and_diag_index / num_diags_; - const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - diag_index_in_input; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - - // Upper-bound checks for diagonals shorter than max_diag_len. - // y_index and x_index are nonnegative by construction. 
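// Hypothetical example of the index mapping above (assumed sizes, for
// illustration only, not part of the original source): with m_ = n_ = 3,
// num_diags_ = 2, max_diag_len_ = 3 and upper_diag_index_ = 1, index 1
// lands on the superdiagonal of batch 0 (diag_index = 1), giving
// y_index = 1 and x_index = 2, so diag_[1] is written to output position
// (1, 2); index 2 yields x_index = 3 == n_ and is skipped by the bound
// check below, because the superdiagonal of a 3x3 matrix has only two
// valid entries even though max_diag_len_ is 3.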
- if (y_index < m_ && x_index < n_) { - const int out_index = batch * m_ * n_ + y_index * n_ + x_index; - output_[out_index] = diag_[index]; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T* diag_; - T* output_; -}; - -template -struct MatrixDiagPartFunctor { - /*! Similar to MatrixSetDiagFunctor but return the diagonals. diag_index=0 - * refers to the main diagonal, positive value means superdiagonal and - * negative value means subdiagonal */ - MatrixDiagPartFunctor(const int m, const int n, const int num_diags, - const int max_diag_len, const int upper_diag_index, - const T padding, const T* input, T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_mapped_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_mapped_diag_index * max_diag_len_; - const int batch = batch_and_mapped_diag_index / num_diags_; - const int mapped_diag_index = - batch_and_mapped_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - mapped_diag_index; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - if (y_index < m_ && x_index < n_) { - output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index]; - } else { - output_[index] = padding_; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T padding_; - const T* input_; - T* output_; -}; - -template -struct MatrixBandPartScaleEndFunctor { - /*! Compared with MatrixBandPartFunctor, it scale up values at the end of - * band. It can be used to fuse the following operations, which actually - * output triangular with diagonal scaled up: - * 1. dig = matrix_diag_part(middle) - * 2. middle = matrix_set_diag(middle, diag * scalar) - * 3. middle = matrix_band_part(middle, -1, 0) - */ - MatrixBandPartScaleEndFunctor(const int m, const int n, - const int num_lower_diags, - const int num_upper_diags, const T scale, - const T* input, T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - scale_(scale), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? 
n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = 0; - } else if (col == band_end - 1) { - output_[index] = scale_ * input_[index]; - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T scale_; - const T* input_; - T* output_; -}; - -template -struct AddtoScaleFunctor { - AddtoScaleFunctor(const T scale, const T* input, T* output) - : scale_(scale), input_(input), output_(output) {} - HOSTDEVICE void operator()(size_t index) const { - output_[index] += input_[index]; - output_[index] *= scale_; - } - const T scale_; - const T* input_; - T* output_; -}; - -template -class CholeskyGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Input("Out"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* x_grad = context.Output(framework::GradVarName("X")); - auto* x_grad_data = x_grad->mutable_data(context.GetPlace()); - - bool upper = context.Attr("upper"); - auto& dims = out->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - auto& dev_ctx = context.template device_context(); - - std::vector axis(dims.size() - 2); - std::iota(axis.begin(), axis.end(), 0); - axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2}); - Tensor l, l_grad; - if (upper) { - l.mutable_data(dims, context.GetPlace()); - l_grad.mutable_data(dims, context.GetPlace()); - TransCompute(dims.size(), dev_ctx, *out, &l, axis); - TransCompute(dims.size(), dev_ctx, *out_grad, &l_grad, - axis); - } else { - l = *out; - l_grad = *out_grad; - } - auto* l_data = l.data(); - - /*! refer to Iain Murray (2016); arXiv 1602.07527 */ - /*! phi = matmul(L.transpose(-1, -2), grad) */ - Tensor middle; - auto* middle_data = middle.mutable_data(dims, context.GetPlace()); - auto trans_desc = phi::funcs::CreateMatrixDescriptor(dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(dims, 0, false); - auto blas = phi::funcs::GetBlas(context); - blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0)); - - /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */ - platform::ForRange for_range(dev_ctx, tensor_size); - MatrixBandPartScaleEndFunctor matrix_band_part_scale_end_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, - /* scale */ 0.5, middle_data, middle_data); - for_range(matrix_band_part_scale_end_functor); - - // Compute inverse by solving the triangular linear system AX = B, where B - // is the identity matrix. The matrix X would be overwritten on B - Tensor identity; - auto* identity_data = identity.mutable_data(dims, context.GetPlace()); - EyeFunctor eye_functor(m, m, identity_data); - for_range(eye_functor); - // TODO(guosheng): use trsmBatched for GPU - for (int i = 0; i < batch_count; i++) { - blas.TRSM(/*side*/ CblasLeft, /*uplo*/ CblasLower, - /*trans*/ CblasNoTrans, /*diag*/ CblasNonUnit, /*m*/ m, /*n*/ m, - /*alpha*/ T(1), l_data + i * m * m, /*lda*/ m, - identity_data + i * m * m, /*ldb*/ m); - } - Tensor& l_inverse = identity; - - /*! 
x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */ - Tensor middle1; - middle1.mutable_data(dims, context.GetPlace()); - blas.MatMul(l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, - T(0)); - blas.MatMul(middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, - T(0)); - - /*! x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */ - Tensor x_grad_trans; - auto* x_grad_trans_data = - x_grad_trans.mutable_data(dims, context.GetPlace()); - TransCompute(dims.size(), dev_ctx, *x_grad, &x_grad_trans, - axis); - AddtoScaleFunctor addto_scale_functor(0.5, x_grad_trans_data, - x_grad_data); - for_range(addto_scale_functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index 86ed7574654959849beb0c1d547a736ad9e1546c..f25fbbb0c698036951c4b9ae8e9ad2778786a1a2 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -203,7 +203,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { commonterm_conj = helper.Transpose(commonterm_conj); phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), commonterm, commonterm_conj, -1, &commonterm); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index a2fc080faadcf9c24ccc703524cd71da92ce7cdb..f1247ebdf23c8e00cdbfd662a160912a769d7558 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,13 +1,13 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) - cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) + cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0b677f79f7f5d7b9a4a9b2627890e1a42745113a..0a21d937aa1a70120e6112cdb291aa41eb222bb3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -17,22 +17,39 @@ #include #include #include +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/program_desc.h" +#include 
"paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::Scope; +using framework::LoDTensor; +using framework::ParallelExecutor; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; +using framework::paddle2cinn::Name2VarInfoMap; +using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; -CinnLaunchContext::CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope) - : cinn_scope_(cinn_scope) { - // generate all names of the cinn execution arguments +CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj) + : cinn_scope_(compiled_obj.scope) { + // collect all names of the CINN execution arguments auto var_names = cinn_scope_->var_names(); cinn_argument_names_.reserve(var_names.size()); std::transform( @@ -40,7 +57,42 @@ CinnLaunchContext::CinnLaunchContext( std::inserter(cinn_argument_names_, cinn_argument_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); // build name map between the original variables and compiled ones - BuildVarNameMap(paddle2cinn_varmap, cinn_argument_names_); + BuildVarNameMap(compiled_obj.paddle2cinn_varmap, cinn_argument_names_); + + const auto& input_var_names = + graph.Get>(framework::paddle2cinn::kInputVars); + const auto& output_var_names = + graph.Get>(framework::paddle2cinn::kOutputVars); + internal_var_names_ = + ExtractInternalVarNames(input_var_names, output_var_names); + // check completeness of output variables in compiled result + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + + // initialize all execution arguments + InitializeArguments(); + // DEPRECATED(CtfGo): following callback assignment will be deprecated soon + for (auto&& var_name : input_var_names) { + if (IsVariableUsed(var_name)) { + AssignExternalVariable(var_name); + } + } + for (auto&& var_name : output_var_names) { + AssignExternalVariable(var_name); + } + for (auto&& var_name : internal_var_names_) { + AssignInternalVariable(var_name); + } + + // Convert the CINN runtime program to a Paddle graph + runtime_graph_ = std::make_unique( + BuildCompiledProgram(graph, compiled_obj)); + runtime_graph_->SetNotOwned( + kMemOptVarInfoFromMainGraph, + &graph.Get(kMemOptVarInfoFromMainGraph)); } void CinnLaunchContext::BuildVarNameMap( @@ -94,21 +146,15 @@ void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, << std::addressof(place); } -bool CinnLaunchContext::IsArgumentsInitialized() const { - if (hold_buffers_.empty() || name2argument_.empty()) { - return false; - } - return true; -} - bool CinnLaunchContext::IsVariableUsed(const std::string& var_name) const { return paddle2cinn_varmap_.count(var_name) > 0; } -CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& arg_name) { - PADDLE_ENFORCE_GT(cinn_argument_names_.count(arg_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn scope.", arg_name)); +CinnTensor CinnLaunchContext::GetCinnTensorOfVar(const std::string& var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(var_name), true, + 
platform::errors::NotFound("Variable(%s) not applied in CINN", var_name)); + const auto& arg_name = paddle2cinn_varmap_.at(var_name); return cinn_scope_->GetTensor(arg_name); } @@ -132,10 +178,13 @@ std::unordered_set CinnLaunchContext::ExtractInternalVarNames( return remain_var_names; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& var_name, const framework::LoDTensor& paddle_tensor) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Variable(%s) not applied in cinn", var_name)); // check dimension + auto cinn_tensor = GetCinnTensorOfVar(var_name); auto cinn_dims = phi::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( @@ -146,22 +195,28 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, // TODO(CtfGo): check the underlying data type after CINN ready } +void CinnLaunchContext::InitializeArguments() { + for (auto&& arg : cinn_argument_names_) { + auto cinn_buffer = std::make_unique(); + auto cinn_tensor = GetCinnTensorOfVar(cinn2paddle_varmap_.at(arg)); + // assign dimensions with corresponding compiled tensor + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + VLOG(4) << string::Sprintf( + "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg, + framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), + name2argument_.size()); + name2argument_.emplace(arg, cinn_buffer.get()); + hold_buffers_.emplace_back(std::move(cinn_buffer)); + } +} + void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - const auto& paddle_tensor = cached_scope_->GetVar(var_name)->Get(); - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - if (paddle_tensor.IsInitialized()) { - CheckTensorEquivalent(var_name, paddle_tensor, cinn_tensor); - } - - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = cached_scope_->GetVar(var_name)->GetMutable(); @@ -177,22 +232,14 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { // Do nothing return 0; }); - - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); - + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external 
malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = @@ -212,30 +259,106 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { tensor->clear(); return 0; }); - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AppendArgument( - const std::string& arg_name, std::unique_ptr&& buffer) { - name2argument_.emplace(arg_name, buffer.get()); - hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg_name, - framework::DDim(buffer->dims, buffer->dimensions).to_str(), - name2argument_.size()); +framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( + const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) { + CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get(); + // Step 0: Create an empty program_desc, there will be only one block + framework::ProgramDesc program_desc; + auto* block = program_desc.MutableBlock(0); + const std::vector>& instructions = + runtime_program->GetRunInstructions(); + + // build a map that links the name of a Paddle variable to its VarDesc + const std::unordered_set& nodes = graph.Nodes(); + std::unordered_map original_vardescs; + for (auto* node : nodes) { + if (node->IsVar() && node->Var()) { + original_vardescs.emplace(node->Name(), node->Var()); + } + } + + // Step 1: Create a VarDesc for each execution argument: + // (1) For those variables that are input or output variables of the + // original subgraph, there must exist an original VarDesc, so + // we copy some useful info(such as IsParameter,Persistable) + // to the new VarDesc. + // (2) For all variables, the shape, data type of their VarDescs + // are set by values of the corresponding compiled tensors, + // including the in/out variables where the equiality between their tensors + // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
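// A rough sketch of what the loop below produces for a single argument,
// using values assumed from cinn_launch_context_test.cc in this change
// ("cinn_var1" maps back to Paddle variable "var1" with compiled shape
// {3, 4}); the real code iterates over all cinn_argument_names_:
//
//   framework::VarDesc* var_desc = block->Var("var1");
//   var_desc->SetType(framework::proto::VarType::LOD_TENSOR);
//   var_desc->SetDataType(framework::proto::VarType::FP32);  // FP32 by default until CINN exposes dtypes
//   var_desc->SetShape({3, 4});  // taken from the compiled CinnTensor shape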
+ for (auto&& arg : cinn_argument_names_) { + const std::string& var_name = cinn2paddle_varmap_.at(arg); + framework::VarDesc* var_desc = block->Var(var_name); + var_desc->SetType(framework::proto::VarType::LOD_TENSOR); + + auto res = original_vardescs.find(var_name); + if (res != original_vardescs.end()) { + auto* ori_desc = res->second; + var_desc->SetPersistable(ori_desc->Persistable()); + var_desc->SetIsParameter(ori_desc->IsParameter()); + } + + auto cinn_tensor = GetCinnTensorOfVar(var_name); + // TODO(CtfGo): set the corresponding data type after CINN ready, + // currently set as FP32 in default + var_desc->SetDataType(framework::proto::VarType::FP32); + var_desc->SetShape(std::vector(cinn_tensor->shape().data().begin(), + cinn_tensor->shape().data().end())); + } + + // transform names of the input or output arguments of a CINN instruction + // to the corresponding Paddle variable names, and repack them as one vector + auto trans_and_pack_args_fn = + [this](const std::vector>& cinn_args_array) { + std::vector var_names; + for (auto&& cinn_args : cinn_args_array) { + for (auto&& arg : cinn_args) { + auto res = cinn2paddle_varmap_.find(arg); + PADDLE_ENFORCE_NE( + res, cinn2paddle_varmap_.end(), + platform::errors::NotFound("Argument(%s) not found", arg)); + var_names.emplace_back(res->second); + } + } + return var_names; + }; + + // Step 2: create a VarDesc of cinn_instruction_run op for + // each CINN instruction and append it to the main block + for (auto ins_idx = 0; ins_idx < instructions.size(); ++ins_idx) { + auto* ins = instructions.at(ins_idx).get(); + auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); + auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + + auto* op_desc = block->AppendOp(); + op_desc->SetType("cinn_instruction_run"); + op_desc->SetInput(kX, in_args); + op_desc->SetOutput(kOutputs, out_args); + op_desc->SetAttr(kCachedIndex, + {static_cast(compiled_obj.cached_index)}); + op_desc->SetAttr(kInstructionIndex, {static_cast(ins_idx)}); + } + + return program_desc; } -const std::map& -CinnLaunchContext::FinalizeArguments() const { - // Check all execution parameters are assigned valued. 
- std::for_each(cinn_argument_names_.begin(), cinn_argument_names_.end(), - [this](const auto& arg_name) { - PADDLE_ENFORCE_GT( - name2argument_.count(arg_name), 0, - platform::errors::NotFound( - "Argument(%s) is missed for execution", arg_name)); - }); - return name2argument_; +ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, + framework::Scope* scope) { + if (!parallel_executor_) { + framework::details::ExecutionStrategy exec_strategy; + framework::details::BuildStrategy build_strategy; + parallel_executor_ = std::make_unique( + place, scope, exec_strategy, build_strategy, runtime_graph_.get()); + } + + // update the scope bound to an OpHandle and rebuild temporary variables + std::unordered_map scope_map = { + {parallel_executor_->GetLocalScopes().front(), scope}}; + parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); + parallel_executor_->PrepareVariables(scope); + return parallel_executor_.get(); } cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 502e6a92dc10bba4a39bef0a493f8c5deb7eeb71..a4d613ea618a886d99344a34ad80aa02e88c10e7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -21,7 +21,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" @@ -35,10 +35,25 @@ class Program; } // namespace cinn::hlir::framework namespace paddle { +namespace framework { +class ProgramDesc; +class Scope; +class VarDesc; + +namespace ir { +class Graph; +} // namespace ir + +namespace paddle2cinn { +class CinnCompiledObject; +} // namespace paddle2cinn +} // namespace framework + namespace operators::details { using CinnTensor = ::cinn::hlir::framework::Tensor; using CinnScope = ::cinn::hlir::framework::Scope; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; // This class is used to cache some reusable data among repeated // executions for efficiency and it also provides easy interfaces @@ -49,58 +64,71 @@ using CinnScope = ::cinn::hlir::framework::Scope; // Variable while a CINN variable is called an Argument. class CinnLaunchContext { public: - explicit CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope); + explicit CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); + + // Initialize a ParallelExecutor to execute the runtime graph, + // it will be constructed in the first call, and just update + // the execution scope in the following usage. 
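// A hypothetical call sequence pieced together from this header and the
// cinn_launch_op changes elsewhere in this change (a sketch under assumed
// variable names, not the authoritative usage):
//
//   CinnLaunchContext launch_context(graph, compiled_obj);  // build name maps, arguments, runtime graph
//   launch_context.UpdateCapturedEnv(scope, place);         // capture scope/place for the buffer callbacks
//   auto* pe = launch_context.InitializePE(place, &scope);  // constructed once, scope rebound on reuse
//   // the returned ParallelExecutor then runs the generated cinn_instruction_run ops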
+ framework::ParallelExecutor* InitializePE(const platform::Place& place, + framework::Scope* scope); // explicitly update several environment variables captured // by callback of execution arguments void UpdateCapturedEnv(const framework::Scope& scope, const platform::Place& place); - // Return whether execution arguments has been initialized - bool IsArgumentsInitialized() const; - // Return whether a Paddle variable used in cinn execution bool IsVariableUsed(const std::string& var_name) const; - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name); - - // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name); + // Check the equiality in type and dimension between the tensor + // in Paddle and the compiled tensor returned by CINN of a same variable + void CheckTensorEquivalent(const std::string& var_name, + const framework::LoDTensor& paddle_tensor); - // Extract internal variable names from all applied variables - // in execution by excluding the input and output variables - std::unordered_set ExtractInternalVarNames( - const std::vector& input_var_names, - const std::vector& output_var_names); + // Return internal variable names list + const std::unordered_set& GetInternalVarNames() const { + return internal_var_names_; + } // Finalize all execution arguments and return the name->argument map - const std::map& FinalizeArguments() const; + const std::map& FinalizeArguments() const { + return name2argument_; + } // Return the cinn_buffer_t* of a specific variable cinn_buffer_t* GetCinnBufferOfVar(const std::string& var_name); private: - // Get CinnTensor with CINN argument name - CinnTensor GetCinnTensor(const std::string& arg_name); + // Get corresponding compiled tensor of a Paddle variable name + CinnTensor GetCinnTensorOfVar(const std::string& var_name); + // Build the name maps of paddle->cinn and cinn->paddle // in reverse for all variables used in cinn execution void BuildVarNameMap( const std::unordered_map& compiled_varmap, const std::unordered_set& argument_names); - // Check whether the tensor in Paddle and the compiled - // tensor returned by CINN of a same variable - // are equivalent in type and dimension - void CheckTensorEquivalent(const std::string& var_name, - const framework::LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor); + // Extract internal variable names from all applied variables + // in execution by excluding the input and output variables + std::unordered_set ExtractInternalVarNames( + const std::vector& input_var_names, + const std::vector& output_var_names); + + // Initialize each execution argument with a cinn_buffer_t + void InitializeArguments(); - // Append an argument with (cinn name)->(cinn_buffer_t) pair - void AppendArgument(const std::string& arg_name, - std::unique_ptr&& buffer); + // Assign tensor buffer to input or output variables + void AssignExternalVariable(const std::string& var_name); + + // Assign tensor buffer to internal variables + void AssignInternalVariable(const std::string& var_name); + + // Construct a Paddle ProgramDesc with the CINN runtime + // instructions included in the compiled CINN Program + framework::ProgramDesc BuildCompiledProgram( + const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); private: const framework::Scope* cached_scope_ = nullptr; @@ -111,16 +139,22 @@ class CinnLaunchContext { std::unordered_map paddle2cinn_varmap_; // a name map from cinn execution arguments to 
paddle variables std::unordered_map cinn2paddle_varmap_; + // a list of internal variable names in Paddle + std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; + // the ir::Graph object converted from the program compiled by CINN + std::unique_ptr runtime_graph_; + // a ParallelExecutor to execute the runtime graph + std::unique_ptr parallel_executor_; + // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can // not be released until the runtime program finish execution. std::vector> hold_buffers_; - // this map saves all execution arguments with their cinn names as key, // and it is passed to the Execute interface of a cinn runtime program. std::map name2argument_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 58a9c5db712b9ae90f7a3bb486266b61e386d591..4976a59d1dd3829b637f18b3c815e4d2fc9c7526 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,87 +13,229 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include +#include +#include +#include "cinn/common/target.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/phi/core/ddim.h" +USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::OpDesc; +using framework::ProgramDesc; +using framework::LoDTensor; +using framework::ir::Graph; +using framework::ParallelExecutor; +using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -std::unique_ptr CreateDefaultLaunchContext() { +const Graph& InitDefaultSubgraph() { static std::once_flag initialized; - static std::unordered_map paddle2cinn_varmap; - static std::shared_ptr cinn_scope; - std::call_once(initialized, [&paddle2cinn_varmap, &cinn_scope]() { - auto& scope = cinn_scope; - scope = std::make_shared(); + static std::unique_ptr graph; + std::call_once(initialized, [&]() { + ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* var1 = block->Var("var1"); + var1->SetPersistable(true); + block->Var("var2"); + block->Var("var3"); + block->Var("var4"); + auto* var5 = block->Var("var5"); + var5->SetIsParameter(true); + auto add_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var1"}}, {"Y", {"var2"}}}, + {{"Out", {"var3"}}}, {})); + block->AppendAllocatedOp(std::move(add_op)); + auto mul_op = std::unique_ptr(new 
OpDesc( + "mul", {{"X", {"var1"}}, {"Y", {"var2"}}}, {{"Out", {"var4"}}}, {})); + block->AppendAllocatedOp(std::move(mul_op)); + auto res_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var3"}}, {"Y", {"var4"}}}, + {{"Out", {"var5"}}}, {})); + block->AppendAllocatedOp(std::move(res_op)); + graph = std::make_unique(program); + + graph->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({"var1", "var2"})); + graph->Set>( + framework::paddle2cinn::kInternalVars, + new std::vector({"var3", "var4"})); + graph->Set>( + framework::paddle2cinn::kOutputVars, + new std::vector({"var5"})); + graph->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + }); + return *graph.get(); +} +CinnCompiledObject* InitDefaultCompiledObject() { + static std::once_flag initialized; + static auto compiled_obj = std::make_unique(); + std::call_once(initialized, [result = compiled_obj.get()]() { + auto& scope = result->scope; + scope = std::make_shared(); scope->Var("cinn_var1"); scope->GetTensor("cinn_var1")->Resize(CinnShape({3, 4})); scope->Var("cinn_var2"); scope->GetTensor("cinn_var2")->Resize(CinnShape({6, 7, 8})); scope->Var("cinn_var3"); scope->GetTensor("cinn_var3")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var4"); + scope->GetTensor("cinn_var4")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var5"); + scope->GetTensor("cinn_var5")->Resize(CinnShape({10, 16})); - paddle2cinn_varmap = { - {"var1", "cinn_var1"}, {"var3", "cinn_var3"}, {"var4", "cinn_var4"}}; + // input variables: var1, var2; output: var5 + // internal variables: var3 and var4, here var3 is retained + // in result map, so the name will be used neither cinn_var3 + auto& paddle2cinn_varmap = result->paddle2cinn_varmap; + paddle2cinn_varmap = {{"var1", "cinn_var1"}, + {"var2", "cinn_var2"}, + {"var3", "cinn_var3"}, + {"var5", "cinn_var5"}}; + + auto& runtime_program = result->runtime_program; + std::vector> instructions; + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var3"}, "elementwise_add")); + instructions.emplace_back( + new CinnInstruction(cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var4"}, "mul")); + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var3", "cinn_var4"}, {"cinn_var5"}, "elementwise_add")); + runtime_program = + std::make_unique(scope, std::move(instructions)); + result->cached_index = 110; }); - return std::make_unique(paddle2cinn_varmap, cinn_scope); + return compiled_obj.get(); } -TEST(CinnLaunchContextTest, TestBasic) { - auto launch_context = CreateDefaultLaunchContext(); - // test IsVariableUsed +class CinnLaunchContextTest : public ::testing::Test { + public: + std::unique_ptr launch_context; + CinnCompiledObject* compiled_obj; + + void SetUp() override { + compiled_obj = InitDefaultCompiledObject(); + launch_context = std::make_unique(InitDefaultSubgraph(), + *compiled_obj); + } +}; + +TEST_F(CinnLaunchContextTest, TestConstructResult) { ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var2"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var3"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); - // test UpdateCapturedEnv - platform::CPUPlace place; - framework::Scope scope; - ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); - // test IsArgumentsInitialized - 
ASSERT_FALSE(launch_context->IsArgumentsInitialized()); + ASSERT_EQ(launch_context->IsVariableUsed("var5"), true); + + // check result of ExtractInternalVarNames + ASSERT_EQ(launch_context->GetInternalVarNames(), + std::unordered_set({"var3", "cinn_var4"})); + + // check completeness of arguments list, and also check + // the two name maps of the paddle->cinn and the reverse one + // through the IsVariableUsed interface + auto&& arguments = launch_context->FinalizeArguments(); + ASSERT_EQ(arguments.size(), 5); + auto check_argument_fn = [&arguments, this](const std::string& var_name, + const std::string& arg_name) { + ASSERT_EQ(launch_context->IsVariableUsed(var_name), true); + ASSERT_NO_THROW(launch_context->GetCinnBufferOfVar(var_name)); + ASSERT_GT(arguments.count(arg_name), 0); + EXPECT_EQ(launch_context->GetCinnBufferOfVar(var_name), + static_cast(arguments.at(arg_name))); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + auto&& scope = compiled_obj->scope; + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(scope->GetTensor(arg_name)->shape().data())); + }; + check_argument_fn("var1", "cinn_var1"); + check_argument_fn("var2", "cinn_var2"); + check_argument_fn("var3", "cinn_var3"); + check_argument_fn("cinn_var4", "cinn_var4"); + check_argument_fn("var5", "cinn_var5"); } -TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { +TEST_F(CinnLaunchContextTest, TestCheckTensorEquivalent) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(phi::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1"), + ASSERT_THROW(launch_context->CheckTensorEquivalent("var1", *tensor1), paddle::platform::EnforceNotMet); } -TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { +TEST_F(CinnLaunchContextTest, TestBuildCompiledProgram) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); - launch_context->UpdateCapturedEnv(scope, place); - auto* tensor4 = scope.Var("var4")->GetMutable(); + ParallelExecutor* pe = nullptr; + ASSERT_NO_THROW((pe = launch_context->InitializePE(place, &scope))); - // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4"), - paddle::platform::EnforceNotMet); - // not found - ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), - paddle::platform::EnforceNotMet); + // check details of program build by compiled instructions + const ProgramDesc& program = pe->Graph().OriginProgram(); + ASSERT_EQ(program.Size(), 1); + const auto& block = program.Block(0); + // vars + std::set var_names = block.LocalVarNames(); + ASSERT_EQ(var_names.size(), 5); + for (auto&& var_name : var_names) { + auto* var = block.FindVar(var_name); + ASSERT_NE(var, nullptr); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(var->GetShape())); + } + ASSERT_TRUE(block.FindVar("var1")->Persistable()); + ASSERT_FALSE(block.FindVar("var5")->Persistable()); + ASSERT_TRUE(block.FindVar("var5")->IsParameter()); + ASSERT_FALSE(block.FindVar("var1")->IsParameter()); + // ops + ASSERT_EQ(block.OpSize(), 3); + auto* op1 = block.Op(0); + ASSERT_EQ(op1->Type(), "cinn_instruction_run"); + ASSERT_EQ(op1->Input(kX), std::vector({"var1", "var2"})); + 
ASSERT_EQ(op1->Output(kOutputs), std::vector({"var3"})); + ASSERT_EQ(op1->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op1->GetAttrIfExists(kInstructionIndex), 0); + auto* op3 = block.Op(2); + ASSERT_EQ(op3->Type(), "cinn_instruction_run"); + ASSERT_EQ(op3->Input(kX), std::vector({"var3", "cinn_var4"})); + ASSERT_EQ(op3->Output(kOutputs), std::vector({"var5"})); + ASSERT_EQ(op3->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op3->GetAttrIfExists(kInstructionIndex), 2); } -TEST(CinnLaunchContextTest, TestAppendArgument) { - platform::CPUPlace cpu_place; - platform::Place place(cpu_place); +// DEPRECATED(CtfGo): following test of callback assignment +// will be deprecated after we switch to pe +TEST_F(CinnLaunchContextTest, TestCallbackAssignment) { + platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); // assign external variables @@ -101,33 +243,8 @@ TEST(CinnLaunchContextTest, TestAppendArgument) { float* data1 = tensor1->mutable_data(phi::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - - auto* tensor3 = scope.Var("var3")->GetMutable(); - tensor3->mutable_data(phi::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); - - // FinalizeArguments missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); - // test get internal variables - auto internal_variable_names = - launch_context->ExtractInternalVarNames({"var1"}, {"var3"}); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); - - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(phi::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); - // check argument is set correctly and alloc/free callbacks work well - auto name2argument = launch_context->FinalizeArguments(); - ASSERT_EQ(name2argument.size(), 3); - ASSERT_EQ(name2argument.count("cinn_var1"), 1); - ASSERT_TRUE(launch_context->IsArgumentsInitialized()); - - auto* cinn_buffer = - static_cast(name2argument.at("cinn_var1")); + auto* cinn_buffer = launch_context->GetCinnBufferOfVar("var1"); ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 1db9f2f25e270fa61309f3d2e2522b37c73992f4..cf3b98c6679b80acad8da69c91addadb9f66ce44 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -105,63 +105,29 @@ class CinnLaunchOpKernel : public framework::OpKernel { auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. launch_context->UpdateCapturedEnv(scope, place); - if (!launch_context->IsArgumentsInitialized()) { - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. 
- for (const auto& var_name : input_no_need_buffer_variable_names) { - // the input variable declared as 'no need buffer' can not be used - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), false, - platform::errors::InvalidArgument( - "Input variable(%s) should not be used by cinn in execution", - var_name)); - } - - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - - launch_context->AssignExternalVariable(var_name); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - launch_context->AssignExternalVariable(var_name); - } - - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->ExtractInternalVarNames( - input_x_variable_names, output_variable_names); - for (const auto& var_name : internal_variable_names) { - launch_context->AssignInternalVariable(var_name); + // 3.1 Input variables: tensors of input variables have + // been initialized before graph compiled, just check the + // equiality between tensors of paddle and cinn. + for (const auto& var_name : input_x_variable_names) { + // some input variables don't need for cinn because they are + // eliminated by optimized passes or some cinn operators use + // less variables + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; } + launch_context->CheckTensorEquivalent(var_name, + *inputs_name2tensor.at(var_name)); } + // 3.2 Output variables: the output variables will be initialized + // and allocated buffer in callbacks which are defined in the + // external_malloc/free interface of cinn_buffer_t + // in their corresponding arguments. + // 3.3 Internal variables: A temporary scope is created in + // UpdateCapturedEnv to keep the internal variables and + // they are also initialized through callbacks + // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index eb3d725d554b1c522cc87d031ee82c1700dc06a0..9720a5309fa6e1ce0316f709d347599fa125f507 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/phi/core/ddim.h" @@ -31,6 +32,7 @@ using LoDTensor = framework::LoDTensor; using Variable = framework::Variable; using Graph = framework::ir::Graph; using Node = framework::ir::Node; +using framework::paddle2cinn::Name2VarInfoMap; std::unique_ptr CreateOnlyElementwiseAddGraph( const std::string& x_name, const std::string& y_name, @@ -71,6 +73,16 @@ std::unique_ptr CreateOnlyElementwiseAddGraph( y_node->inputs = {feed_op_node_y}; y_node->outputs = {elementwise_add_node}; out_node->inputs = {elementwise_add_node}; + // set necessary attributes + g->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({x_name, y_name})); + g->Set>(framework::paddle2cinn::kInternalVars, + new std::vector({})); + g->Set>(framework::paddle2cinn::kOutputVars, + new std::vector({out_name})); + g->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); return g; } diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 9f27e2238c9c832e62d6de93798b7fab20592a4c..900fd4d8d292e3c4a8884957dceeaa020ee0003e 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -54,7 +54,7 @@ struct FillConstantVisitor { * = nullptr) const { #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(dev_ctx_.GetPlace())) { - Tensor tensor_tmp(framework::TransToPtenDataType(dtype_)); + Tensor tensor_tmp(framework::TransToPhiDataType(dtype_)); tensor_tmp.mutable_data({1}, context_.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value_)); @@ -194,7 +194,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { void *fused_tensor_ptr = fused_tensor->Resize(phi::make_ddim({static_cast(numel)})) .mutable_data(context.GetPlace(), - framework::TransToPtenDataType(dtype)); + framework::TransToPhiDataType(dtype)); VLOG(10) << "Fused tensor addr " << fused_tensor_ptr; // Init the continuous space diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index a04935d43eb2d8ab36749f0d2a35b09552001e7c..7e5120cd2b392b1eb0698727ccebac485193f6d9 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -23,8 +23,9 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \ + defined(PADDLE_WITH_CNCL) #include "paddle/fluid/platform/collective_helper.h" #endif @@ -45,6 +46,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif + #if defined(PADDLE_WITH_ASCEND_CL) DECLARE_bool(hccl_check_nan); #endif @@ -398,6 +403,65 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { } }; +template +class CAllReduceOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + cnclReduceOp_t cncl_red_type = cnclSum; + switch (red_type) { + case kRedSum: + cncl_red_type = cnclSum; + break; + + case kRedMax: + cncl_red_type = cnclMax; + break; + + case kRedMin: + cncl_red_type = cnclMin; + break; + + case kRedProd: + cncl_red_type = cnclProd; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce( + sendbuff, recvbuff, numel, dtype, cncl_red_type, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4879696b3f47032dd30e35b2ffba05af8fa2f609 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index 123fb2aafb524d89901959e57d378838acfdf0af..d315f211709e4f76c2d5c685721961a91c2102fe 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -30,7 +30,8 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int numel = x->numel(); - cnclDataType_t dtype = platform::ToCNCLDataType(x->type()); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 4f1f1ec6512067fbb1e2e5af2bd3ef7cd5af4f9e..b5beb770909b56aed590020ccaaa71f50b96a75d 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -98,8 +99,8 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { const auto& labels_dims = labels->dims(); const int axis = logits_dims.size() - 1; - const int N = SizeToAxis(axis, logits_dims); - const int D = SizeFromAxis(axis, logits_dims); + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); Tensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); @@ -220,8 +221,8 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; - const int N = SizeToAxis(axis, sofrmax_dims); - const int D = SizeFromAxis(axis, sofrmax_dims); + const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); Tensor logit_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h index c7cfd41fa2556873166701c96616323d2b1e40c3..f5399e3215d5822c05ca709d95af47eeab921104 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 2a815ef01e1f7acbfa7f1a3d6ea6808c9877155e..b2173d1b53104a132e721cd3f72f7c6e7ace4af1 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -37,7 +37,7 @@ class ConjKernel : public framework::OpKernel { // call new kernel phi::ConjKernel( - static_cast::TYPE&>(dev_ctx), *x, out); } diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 31ed10a71201c666c72e23853fdf925a42a80fb3..6bf419c47a5669b87c0b47d48259362a66a23239 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -272,8 +272,18 @@ class ConditionalBlockGradInferVarType : public framework::VarTypeInference { // Input is {Tensor, LoDTensorArray}, we need synchronous the Input's // VarType into Input@GRAD to avoid generating {Tensor, Tensor} as // Input@GRAD. - ctx->SyncTypeAndDataType(ConditionalOp::kInputs, - framework::GradVarName(ConditionalOp::kInputs)); + auto input_size = ctx->InputSize(ConditionalOp::kInputs); + auto output_size = + ctx->OutputSize(framework::GradVarName(ConditionalOp::kInputs)); + PADDLE_ENFORCE_EQ(input_size, output_size, + platform::errors::InvalidArgument( + "input_size and output_size should be equal for " + "conditional_block_grad_op.")); + for (size_t i = 0; i < output_size; ++i) { + ctx->SyncTypeAndDataType(ConditionalOp::kInputs, + framework::GradVarName(ConditionalOp::kInputs), + i); + } } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index ed4995d4fbeda208bfdd09be52c98195b52786db..de3d8bd996149f92ed24be63fadacfc51c2764b0 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index bda5ac42da8e3845b76880209438bfed1cacc6e0..dff60afd74c02f458b5b3c7428c2703197b61af0 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(cudnn_deterministic); DECLARE_uint64(conv_workspace_size_limit); diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index e6b30ba42fc2664404ecc51bb68c8e3c06a26dc1..fe00ee06603f0ecf2e3fa6ac367303a70702508f 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -12,67 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/cross_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { using framework::Tensor; using framework::DDim; +const int kDefaultDim = framework::DDim::kMaxRank; class CrossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of CrossOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, - platform::errors::InvalidArgument( - "Input(Index) of CrossOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of CrossOp should not be null.")); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - auto dim = ctx->Attrs().Get("dim"); - - bool dims_match = CheckDims(x_dim, y_dim); - PADDLE_ENFORCE_EQ(dims_match, true, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - x_dim, y_dim)); - - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < x_dim.size() && dim >= (0 - x_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - x_dim.size(), x_dim.size() - 1, dim)); - if (dim < 0) { - dim += x_dim.size(); - } - PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims()[dim] should be equal to 3." - "But received Input(X/Y).dims()[dim] = %d.", - x_dim[dim])); - } - - ctx->SetOutputDim("Out", x_dim); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -153,17 +109,10 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PT_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, - ops::CrossGradMaker); + ops::CrossGradMaker, + CrossInferShapeFunctor); REGISTER_OPERATOR(cross_grad, ops::CrossGradOp); -REGISTER_OP_CPU_KERNEL( - cross, ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel); -REGISTER_OP_CPU_KERNEL( - cross_grad, ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel); diff --git a/paddle/fluid/operators/cross_op.cu b/paddle/fluid/operators/cross_op.cu deleted file mode 100644 index 78bbb3ea564544a46e19723e9a83e90194b50597..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cross_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/cross_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cross, ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel); -REGISTER_OP_CUDA_KERNEL( - cross_grad, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel); diff --git a/paddle/fluid/operators/cross_op.h b/paddle/fluid/operators/cross_op.h deleted file mode 100644 index b1c5eb62fdce57640e4b6c1a9bf1f55d59d1c6d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cross_op.h +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; -const int kDefaultDim = framework::DDim::kMaxRank; - -inline bool CheckDims(const DDim& dims_x, const DDim& dims_y) { - if (dims_x.size() != dims_y.size()) { - return false; - } - for (int i = 0; i < dims_x.size(); i++) { - if (dims_x[i] != dims_y[i]) { - return false; - } - } - return true; -} - -template -class CrossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_x_var = context.InputVar("X"); - auto* input_y_var = context.InputVar("Y"); - auto* output_var = context.OutputVar("Out"); - - auto& input_x = input_x_var->Get(); - auto& input_y = input_y_var->Get(); - auto* output = output_var->GetMutable(); - int dim = context.Attr("dim"); - - auto input_x_dims = input_x.dims(); - auto input_y_dims = input_y.dims(); - bool dims_match = CheckDims(input_x_dims, input_y_dims); - PADDLE_ENFORCE_EQ(dims_match, true, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - input_x_dims, input_x_dims)); - - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_x_dims.size(), input_x_dims.size() - 1, dim)); - if (dim < 0) { - dim += input_x_dims.size(); - } - - PADDLE_ENFORCE_EQ( - input_x_dims[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims[dim] must be equal to 3. 
But received: " - "Input(X/Y).dims[dim] = [%d].", - input_x_dims[dim])); - } else { - for (auto i = 0; i < input_x_dims.size(); i++) { - if (input_x_dims[i] == 3) { - dim = i; - break; - } - } - PADDLE_ENFORCE_EQ(dim == kDefaultDim, false, - platform::errors::InvalidArgument( - "There must be at least one dimension 'd' so that " - "Input(X/Y).dims()[d] is equal to 3. " - "But received: Input(X/Y).dims() == [%s].", - input_x_dims)); - } - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_x_dims[i]; - } - auto slice_size = 1; - for (auto i = dim + 1; i < input_x_dims.size(); i++) { - slice_size *= input_x_dims[i]; - } - - std::vector input_x_vec, input_y_vec; - framework::TensorToVector(input_x, context.device_context(), &input_x_vec); - framework::TensorToVector(input_y, context.device_context(), &input_y_vec); - std::vector out_vec(output->numel()); - - output->mutable_data(context.GetPlace()); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < 3; j++) { - auto dst_pos = (3 * i + j) * slice_size; - auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; - auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; - - for (auto k = 0; k < slice_size; k++) { - out_vec[dst_pos + k] = - input_x_vec[in_pos1 + k] * input_y_vec[in_pos2 + k] - - input_x_vec[in_pos2 + k] * input_y_vec[in_pos1 + k]; - } - } - } - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input_x_dims); - } -}; - -template -class CrossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_x_var = context.InputVar("X"); - auto* input_y_var = context.InputVar("Y"); - auto* input_out_grad_var = context.InputVar(framework::GradVarName("Out")); - auto* output_x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto* output_y_grad_var = context.OutputVar(framework::GradVarName("Y")); - - auto& input_x = input_x_var->Get(); - auto& input_y = input_y_var->Get(); - auto& input_out_grad = input_out_grad_var->Get(); - auto* output_x_grad = output_x_grad_var->GetMutable(); - auto* output_y_grad = output_y_grad_var->GetMutable(); - - int dim = context.Attr("dim"); - auto input_x_dims = input_x.dims(); - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_x_dims.size(), input_x_dims.size() - 1, dim)); - if (dim < 0) { - dim += input_x_dims.size(); - } - - PADDLE_ENFORCE_EQ( - input_x_dims[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims[dim] must be equal to 3. But received: " - "Input(X/Y).dims[dim] = [%d].", - input_x_dims[dim])); - } else { - for (auto i = 0; i < input_x_dims.size(); i++) { - if (input_x_dims[i] == 3) { - dim = i; - break; - } - } - PADDLE_ENFORCE_EQ(dim == kDefaultDim, false, - platform::errors::InvalidArgument( - "There must be at least one dimension 'd' " - "so that Input(X/Y).dims()[d] is equal to 3. 
" - "But received: Input(X/Y).dims() == [%s].", - input_x_dims)); - } - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_x_dims[i]; - } - auto slice_size = 1; - for (auto i = dim + 1; i < input_x_dims.size(); i++) { - slice_size *= input_x_dims[i]; - } - - std::vector input_x_vec, input_y_vec, input_dout_vec; - framework::TensorToVector(input_x, context.device_context(), &input_x_vec); - framework::TensorToVector(input_y, context.device_context(), &input_y_vec); - framework::TensorToVector(input_out_grad, context.device_context(), - &input_dout_vec); - std::vector out_dx_vec(output_x_grad->numel()); - std::vector out_dy_vec(output_y_grad->numel()); - - output_x_grad->mutable_data(context.GetPlace()); - output_y_grad->mutable_data(context.GetPlace()); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < 3; j++) { - auto dst_pos = (3 * i + j) * slice_size; - auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; - auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; - for (auto k = 0; k < slice_size; k++) { - out_dx_vec[dst_pos + k] = - input_dout_vec[in_pos2 + k] * input_y_vec[in_pos1 + k] - - input_dout_vec[in_pos1 + k] * input_y_vec[in_pos2 + k]; - out_dy_vec[dst_pos + k] = - input_dout_vec[in_pos1 + k] * input_x_vec[in_pos2 + k] - - input_dout_vec[in_pos2 + k] * input_x_vec[in_pos1 + k]; - } - } - } - framework::TensorFromVector(out_dx_vec, context.device_context(), - output_x_grad); - framework::TensorFromVector(out_dy_vec, context.device_context(), - output_y_grad); - output_x_grad->Resize(input_x_dims); - output_y_grad->Resize(input_x_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 8a44c1327b9e6fbb1f8767a9ecdf40faf95993eb..b1f2e61ef3930d81aa56794c0d232930452b03d9 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -110,10 +110,12 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { // merge elements and delete blank T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + paddle::framework::MixVector mixv_input_lod(&input_lod[level]); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( num_tokens, tokens, num_seq, - input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, - merge_repeated, dev_out_lod0_ptr, output_data); + mixv_input_lod.CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); + mixv_input_lod.CopyToCPU(); // set output lod std::vector host_out_lod0(dev_out_lod0.begin(), diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index ad96dc24b9206c0e7c6bc172180cec829230dde1..1a3bdee53e9bd31b410093446280a18e2f75d7a2 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -149,11 +149,12 @@ class CVMGradCUDAKernel : public framework::OpKernel { batch_size, lod[lod.size() - 1], platform::errors::PreconditionNotMet( "Output(X@GRAD)'s dim[0] must be equal to last element of lod")); + paddle::framework::MixVector mixv_lod(&lod); CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( use_cvm, item_size, cvm_data, dout_data, dx_data, true, - lod.CUDAData(context.GetPlace()), lod.size(), dx_numel); + mixv_lod.CUDAData(context.GetPlace()), lod.size(), dx_numel); } } }; diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 
bda22dd0155cce6cec767dfe1c3b282788a5f160..65f2a5590716d42649dbf766575c72571c23eb4d 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -57,9 +57,11 @@ class GPUBoxClipKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); const size_t batch_size = lod.back().size() - 1; T *output_data = output->mutable_data(dev_ctx.GetPlace()); + paddle::framework::MixVector mix_vector(&abs_offset_lod[0]); GPUBoxClip<<>>( - input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + input->data(), mix_vector.CUDAMutableData(dev_ctx.GetPlace()), bbox_width, im_info->data(), output_data); + mix_vector.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 01b15865e93b6035598b382b506504e9fcc22698..c4506f04e083e0a1e7671605ef6e39a06aa68eed 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -108,7 +108,8 @@ class TargetAssignKernel : public framework::OpKernel { auto x_lod = x->lod().back(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace()); + paddle::framework::MixVector mixv_x_lod(&x_lod); + size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else size_t* x_lod_data = x_lod.data(); #endif @@ -116,6 +117,9 @@ class TargetAssignKernel : public framework::OpKernel { TargetAssignFunctor functor(x_data, match_idx_data, x_lod_data, mismatch_value, n, m, p, k, out_data, out_wt_data); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + mixv_x_lod.CopyToCPU(); +#endif auto& device_ctx = ctx.template device_context(); platform::ForRange for_range(device_ctx, n * m); @@ -130,13 +134,17 @@ class TargetAssignKernel : public framework::OpKernel { const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); + paddle::framework::MixVector mixv_neg_lod(&neg_lod); + size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else size_t* neg_lod_data = neg_lod.data(); #endif NegTargetAssignFunctor neg_trg_functor; neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k, mismatch_value, out_data, out_wt_data); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + mixv_neg_lod.CopyToCPU(); +#endif } } }; diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 30ea323733238cd30e8a4e440e1cab08d90c64f0..0160277dc79af50c555b1257e6ffa216b7b56b62 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/diag_v2_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -23,44 +25,6 @@ namespace operators { class DiagV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "diag_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag_v2"); - - auto x_dims = ctx->GetInputDim("X"); - auto offset = ctx->Attrs().Get("offset"); - - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); - ctx->SetOutputDim("Out", {size_, size_}); - } else if (x_dims.size() == 2UL) { - int64_t size_ = 0; - if (offset >= 0) { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] < x_dims[1] - offset) { - size_ = x_dims[0]; - } else { - size_ = x_dims[1] - offset; - } - } else { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] + offset < x_dims[1]) { - size_ = x_dims[0] + offset; - } else { - size_ = x_dims[1]; - } - } - ctx->SetOutputDim("Out", {size_}); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input tensor X's dimensions of DiagV2Op should be either 1 or " - "2, but received %d.", - x_dims.size())); - } - } }; class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -94,59 +58,15 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -class DiagV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* x_data = X->data(); - auto x_dims = X->dims(); - int offset = context.Attr("offset"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_dims = out->dims(); - - int64_t i; - if (x_dims.size() == 1) { - float padding_value = context.Attr("padding_value"); - phi::funcs::SetConstant set_padding_value; - auto& dev_ctx = context.template device_context(); - set_padding_value(dev_ctx, out, static_cast(padding_value)); - - auto x_length = x_dims[0]; - const int& x_stride = ComputeStride(0, x_dims); - - auto out_stride_0 = ComputeStride(0, out_dims); - auto out_stride_1 = ComputeStride(1, out_dims); - out_data += - (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - - for (i = 0; i < x_length; i++) { - out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; - } - } else { - auto out_length = out_dims[0]; - const int& x_stride_0 = ComputeStride(0, x_dims); - const int& x_stride_1 = ComputeStride(1, x_dims); - - auto out_stride_0 = ComputeStride(0, out_dims); - x_data += (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); - for (i = 0; i < out_length; i++) { - out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; - } - } - } -}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PT_INFER_META(phi::DiagInferMeta)); + REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag_v2, ops::DiagV2Kernel, - ops::DiagV2Kernel, - ops::DiagV2Kernel, - ops::DiagV2Kernel); + paddle::framework::EmptyGradOpMaker, + DiagInferShapeFunctor); diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu deleted file mode 100644 index 9b83b68bea159a9688a80e1b71eecaacb917153b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/diag_v2_op.cu +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/diag_v2_op.h" - -namespace paddle { -namespace operators { - -// Extract the diagonal of a matrix 'x' to a vector 'out'. 
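The shape rule encoded by the DiagV2Op::InferShape removed above (and assumed here to be what phi::DiagInferMeta now provides) is compact enough to restate; a minimal sketch with a hypothetical helper name:

#include <cstdint>
#include <cstdlib>
#include <vector>

// 1-D input of length n: the output is an (n + |offset|) x (n + |offset|) matrix.
// 2-D input: the output is the selected diagonal, of length
// min(rows, cols - offset) for offset >= 0 and min(rows + offset, cols)
// otherwise. Plain comparisons are used instead of std::min, mirroring the
// note in the removed code about Windows / Python 3.8.
std::vector<int64_t> DiagV2OutDims(const std::vector<int64_t>& x_dims,
                                   int offset) {
  if (x_dims.size() == 1) {
    const int64_t n = x_dims[0] + std::abs(offset);
    return {n, n};
  }
  int64_t size = 0;
  if (offset >= 0) {
    size = x_dims[0] < x_dims[1] - offset ? x_dims[0] : x_dims[1] - offset;
  } else {
    size = x_dims[0] + offset < x_dims[1] ? x_dims[0] + offset : x_dims[1];
  }
  return {size};
}

For example, DiagV2OutDims({4}, 2) gives {6, 6}, and DiagV2OutDims({3, 5}, 1) gives {3}.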
-template -__global__ void ExtractDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, - std::ptrdiff_t size, - const std::ptrdiff_t sumStride, - const std::ptrdiff_t outStride) { - for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - const std::ptrdiff_t xOffset = start + sumStride * idx; - out[outStride * idx] = x[xOffset]; - } -} - -// Paste a vector 'x' to the diagonal of a matrix 'out' -template -__global__ void PasteDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, - std::ptrdiff_t x_length, - const std::ptrdiff_t sumStride, - const std::ptrdiff_t xStride) { - for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < x_length; idx += gridDim.x * blockDim.x) { - const std::ptrdiff_t outOffset = start + sumStride * idx; - out[outOffset] = x[xStride * idx]; - } -} - -template -class DiagV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* x_data = X->data(); - auto x_dims = X->dims(); - int offset = context.Attr("offset"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_dims = out->dims(); - auto& dev_ctx = context.template device_context(); - - auto GetBlockGridSize = [&dev_ctx](int64_t size) { - const int64_t block_size = - std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1), - static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - return std::tuple{block_size, grid_size}; - }; - - if (x_dims.size() == 1) { - float padding_value = context.Attr("padding_value"); - phi::funcs::SetConstant set_padding_value; - set_padding_value(dev_ctx, out, static_cast(padding_value)); - - auto x_length = x_dims[0]; - auto size = (offset > 0) ? x_length + offset : x_length - offset; - const int& x_stride = ComputeStride(0, x_dims); - if (size > 0) { - const auto& out_stride_0 = ComputeStride(0, out_dims); - const auto& out_stride_1 = ComputeStride(1, out_dims); - auto start = - (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - - std::tuple block_grid_size = GetBlockGridSize(size); - - PasteDiagonalKernel< - T><<(block_grid_size), std::get<0>(block_grid_size), 0, - dev_ctx.stream()>>>(out_data, x_data, start, x_length, - out_stride_0 + out_stride_1, x_stride); - } - } else { - const int& x_stride_0 = ComputeStride(0, x_dims); - const int& x_stride_1 = ComputeStride(1, x_dims); - - int64_t size; - if (offset > 0) { - size = std::min(x_dims[0], x_dims[1] - offset); - } else { - size = std::min(x_dims[0] + offset, x_dims[1]); - } - - if (size > 0) { - auto start = (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); - const auto& out_stride_0 = ComputeStride(0, out_dims); - - std::tuple block_grid_size = GetBlockGridSize(size); - - ExtractDiagonalKernel< - T><<(block_grid_size), std::get<0>(block_grid_size), 0, - dev_ctx.stream()>>>(out_data, x_data, start, size, - x_stride_0 + x_stride_1, out_stride_0); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - diag_v2, ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel); diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index ca6bcb1147a2fb78031227f0bb3a9f7e01326fcb..c13bf687af23470d4595def6fb6fabf7385c999f 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -180,8 +180,8 @@ struct normal_distribution { /******** Launch GPU function of distribution and transformation *********/ template __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, - DistOp dist, TransformOp trans, - T *out_data) { + DistOp dist, TransformOp trans, T *out_data, + size_t stride) { size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = DistOp::kReturnsCount; #if defined(__NVCC__) @@ -201,7 +201,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, kps::ElementwiseUnary(&result[0], &args[0], trans); kps::WriteData(out_data + i, &result[0], size - i, - 1, total_thread, 1); + 1, stride, 1); + __syncthreads(); } } @@ -234,7 +235,7 @@ void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, DistributionKernel< T, DistOp, TransformOp><<>>( - size, seed, offset, dist, trans, out_data); + size, seed, offset, dist, trans, out_data, total_thread); } #endif diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index e8c28ebfeb00878c69b0e80aef5aa505630f40e8..7fd0a8eb164752f24f0fed4959b0036e1a400f5e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -41,9 +41,9 @@ class DotKernel : public framework::OpKernel { out->mutable_data(x->place()); // call new kernel - phi::DotKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), *x, *y, out); } @@ -66,7 +66,7 @@ class DotGradKernel : public framework::OpKernel { // call new kernel phi::DotGradKernel( - static_cast::TYPE&>(dev_ctx), *tensor_x, *tensor_y, *tensor_dout, tensor_dx, tensor_dy); } diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 21fdf69ac570ac6972173d77194275d629ce436f..2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, const Tensor& mask, int64_t size, Tensor* grad_x, bool is_test = false) { using MT = typename details::MPTypeTrait::Type; - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(grad_y); - - auto& place = *dev_ctx.eigen_device(); + auto stream = dev_ctx.stream(); + MT factor; if (is_test) { if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; + factor = static_cast(1.0f); } else { - dX.device(place) = dY * static_cast(1.0f - dropout_prob); + factor = static_cast(1.0f - dropout_prob); } + std::vector ins = {&grad_y}; + std::vector outs = {grad_x}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { - auto M = EigenVector::Flatten(mask); + std::vector ins = {&grad_y, &mask}; + std::vector outs = {grad_x}; if (dropout_implementation == "upscale_in_train") { if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; +#ifdef PADDLE_WITH_HIP + hipMemset(grad_x->data(), 0, size * sizeof(T)); +#else + cudaMemset(grad_x->data(), 0, size * sizeof(T)); +#endif } else { - auto factor = static_cast(1.0f / (1.0f - dropout_prob)); - auto stream = dev_ctx.stream(); - std::vector ins = {&grad_y, &mask}; - std::vector outs = {grad_x}; - auto functor = CudaDropoutGradFunctor(factor); + factor = static_cast(1.0f / (1.0f - dropout_prob)); paddle::operators::LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } else { - dX.device(place) = dY * M.cast(); + factor = static_cast(1.0f); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 1a256f75675784549958c8dbc41684a9746818df..a995877778e4770ea8ae64c051a71b31c1fb1e29 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -13,7 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#ifdef __xpu__ +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#else #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -21,6 +28,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" #include "paddle/phi/kernels/math_kernel.h" +#endif namespace paddle { namespace operators { @@ -28,7 +36,17 @@ namespace operators { template class ElementwiseAddKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef __xpu__ + std::vector ins; + std::vector outs; + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + const auto& xpu_ctx = + ctx.template device_context(); + paddle::operators::LaunchElementwiseCudaKernel, 1>( + xpu_ctx, ins, &outs, axis, kps::AddFunctor()); +#else auto *x = ctx.Input("X"); auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); @@ -37,9 +55,10 @@ class ElementwiseAddKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), *x, *y, axis, z); +#endif } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.kps similarity index 59% rename from paddle/fluid/operators/elementwise/elementwise_add_op.cu rename to paddle/fluid/operators/elementwise/elementwise_add_op.kps index 2b55d9fbaf6cba83f722e29f6d5359a1a8884c84..d6e0749318e901947b46b4b1d6ff8bbdb16bef36 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,17 +12,50 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef PADDLE_WITH_XPU_KP + +// Please do not modify the following code +#if defined(__CUDA_ARCH__) +#undef __CUDA_ARCH__ +#endif + +#if defined(__CUDACC__) +#undef __CUDACC__ +#endif + +#if defined(__CUDA__) +#undef __CUDA__ +#endif + +#if defined(__NVCC__) +#undef __NVCC__ +#endif + +#include // NOLINT +#include "xpu/kernel/cluster_header.h" // NOLINT +#include "xpu/kernel/debug.h" // NOLINT +#include "xpu/kernel/math.h" // NOLINT + +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/phi/kernels/gpu/elementwise.h" +#endif namespace ops = paddle::operators; namespace plat = paddle::platform; +#ifdef PADDLE_WITH_XPU_KP +REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, + ops::ElementwiseAddKernel); +#else REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, ops::ElementwiseAddKernel>, ops::ElementwiseAddKernel>); +#endif \ No newline at end of file diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 06f9107db27b4f2cce54bbcabe3c53e81e4167d1..9eb4b0352e5337e3fdd758d2e95cfa61d1d62724 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -53,6 +53,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, @@ -65,6 +67,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, @@ -78,6 +82,8 @@ REGISTER_OP_CUDA_KERNEL( float>, ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 5ff0f29ab43a059fefa165dae5c6388231cc8182..e172279145e28c0731ed0d8d91769d0b293662fe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -167,6 +167,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, + 
ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel z_lod->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x_lod); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y_lod); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z_lod); phi::MultiplyRawKernel(static_cast(cuda_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { @@ -100,6 +100,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( @@ -110,6 +111,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x_lod); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z_lod); phi::MultiplyRawKernel( - static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7505890f41d441cbbf958cda3e86b36343e1b2c --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using MLUDeviceContext = platform::MLUDeviceContext; + +static void GetReduceAxes(const int axis, const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* axes) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + for (int64_t i = 0; i < src_dim_size; ++i) { + if (i < axis || i >= target_dim_size + axis) { + axes->push_back(i); + continue; + } + if (src_ddims[i] > target_ddims[i - axis]) { + axes->push_back(i); + } + } +} + +template +class ElementwiseMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out), ToCnnlDataType()); + } +}; + +template +class ElementwiseMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = (axis < 0 ? 
(std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout->dims()) { + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), y_desc.get(), GetBasePtr(y), + x_desc.get(), GetBasePtr(dx), ToCnnlDataType()); + } else { + Tensor dx_temp(x->dtype()); + dx_temp.Resize(dout->dims()); + dx_temp.mutable_data(ctx.GetPlace()); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), y_desc.get(), GetBasePtr(y), + dout_desc.get(), GetBasePtr(&dx_temp), + ToCnnlDataType()); + + std::vector reduce_axes; + GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes); + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(&dx_temp), 0, + nullptr, nullptr, dx_desc.get(), GetBasePtr(dx)); + } + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout->dims()) { + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(dy), ToCnnlDataType()); + } else { + Tensor dy_temp(y->dtype()); + dy_temp.Resize(dout->dims()); + dy_temp.mutable_data(ctx.GetPlace()); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), x_desc.get(), GetBasePtr(x), + dout_desc.get(), GetBasePtr(&dy_temp), + ToCnnlDataType()); + + std::vector reduce_axes; + GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes); + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(&dy_temp), 0, + nullptr, nullptr, dy_desc.get(), GetBasePtr(dy)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(elementwise_mul, ops::ElementwiseMulMLUKernel, + ops::ElementwiseMulMLUKernel, + ops::ElementwiseMulMLUKernel); + +REGISTER_OP_MLU_KERNEL( + elementwise_mul_grad, ops::ElementwiseMulGradMLUKernel, + ops::ElementwiseMulGradMLUKernel, + ops::ElementwiseMulGradMLUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 52de5f77ed325321513d58530ec37ec0e4a23adc..418779c32e8bc216be1532bf714bc21d91c452aa 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -31,18 +31,18 @@ void LaunchElementwiseCudaKernel( std::vector pt_outputs; // TODO(YuanRisheng) *_tmp for cache DenseTensor, 
because the temporary // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // generated by MakePhiDenseTensor can be destroyed when exits loop. *_tmp // can be deleted // when DenseTensor support copy constructor. std::vector> pt_inputs_tmp; std::vector> pt_outputs_tmp; for (auto in : ins) { pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); + std::move(paddle::experimental::MakePhiDenseTensor(*in))); } for (auto out : *outs) { pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); + std::move(paddle::experimental::MakePhiDenseTensor(*out))); } for (int i = 0; i < pt_inputs_tmp.size(); i++) { pt_inputs.push_back(pt_inputs_tmp[i].get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 4a2d92a8c441a9e180c056a19a417be1497c8bae..7d7bb4f26fcf42ec63cd1fab7ec2667a03c8ba4c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" // only can include the headers in paddle/top/api dirs @@ -34,18 +34,18 @@ void LaunchSameDimsElementwiseCudaKernel( std::vector pt_outputs; // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // generated by MakePhiDenseTensor can be destroyed when exits loop. *_tmp // can be deleted // when DenseTensor support copy constructor. 
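The TODO above is the rationale for the *_tmp vectors in both launch helpers: MakePhiDenseTensor returns an owning object, so the helpers keep those owners alive in a vector and hand only raw pointers to the kernel launch. A minimal sketch of that ownership pattern (the type and function names here are illustrative, not the actual phi API):

#include <memory>
#include <vector>

struct DenseTensorStub {};  // stand-in for the converted DenseTensor

void LaunchStub(const std::vector<const DenseTensorStub*>& ins) {
  std::vector<std::unique_ptr<DenseTensorStub>> owners;  // plays the *_tmp role
  std::vector<const DenseTensorStub*> raw;               // plays the pt_inputs role
  for (const auto* in : ins) {
    owners.emplace_back(new DenseTensorStub(*in));  // conversion kept alive
    raw.push_back(owners.back().get());
  }
  // ... the kernel would consume `raw`; `owners` guarantees the pointees
  // outlive the launch instead of dying at the end of each loop iteration.
}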
std::vector> pt_inputs_tmp; std::vector> pt_outputs_tmp; for (auto in : ins) { pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); + std::move(paddle::experimental::MakePhiDenseTensor(*in))); } for (auto out : *outs) { pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); + std::move(paddle::experimental::MakePhiDenseTensor(*out))); } for (int i = 0; i < pt_inputs_tmp.size(); i++) { pt_inputs.push_back(pt_inputs_tmp[i].get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 8d9a2159069423cb3b51517016570057232d2c90..b2cef95d1a349d66161db1c3edf7c14bc8a6d058 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -99,6 +99,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel>, ops::ElementwiseSubKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel>, ops::ElementwiseSubGradKernel, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, @@ -34,6 +36,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, @@ -51,6 +55,8 @@ REGISTER_OP_CUDA_KERNEL( int>, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), *x, *y, axis, z); } @@ -56,7 +56,7 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { auto& dev_ctx = ctx.device_context(); phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), *x, *y, *dout, axis, dx, dy); } @@ -86,7 +86,7 @@ class ElementwiseSubDoubleGradKernel : public framework::OpKernel { ddy_optional = *ddy; } phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), *y, ddx_optional, ddy_optional, *dout, axis, ddout); } diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h index 42c951385a438709569be58507a39230ad77a22d..cb466fffcd7c7358b6e84c18b7895a17b2eaa907 100644 --- a/paddle/fluid/operators/empty_op.h +++ b/paddle/fluid/operators/empty_op.h @@ -39,7 +39,7 @@ class EmptyKernel : public framework::OpKernel { out_tensor->Resize(shape); out_tensor->mutable_data(context.GetPlace(), - framework::TransToPtenDataType(dtype)); + framework::TransToPhiDataType(dtype)); } }; diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 56a6a80b45dff81dcd63efb2db15f2c0c70ab5ee..3d409b4c4f6772bc7b234208e78c5088eeb2fc00 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/erfinv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class ErfinvOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "erfinv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "erfinv"); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class ErfinvOpMaker : public framework::OpProtoAndCheckerMaker { @@ -78,23 +73,13 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, paddle::operators::ErfinvGradMaker, paddle::operators::ErfinvGradMaker, - paddle::operators::ErfinvInplaceInferer); + paddle::operators::ErfinvInplaceInferer, ErfinvInferShapeFunctor); REGISTER_OPERATOR(erfinv_grad, paddle::operators::ErfinvGradOp); - -REGISTER_OP_CPU_KERNEL( - erfinv, - paddle::operators::ErfinvKernel, - paddle::operators::ErfinvKernel); - -REGISTER_OP_CPU_KERNEL( - erfinv_grad, - paddle::operators::ErfinvGradKernel, - paddle::operators::ErfinvGradKernel); diff --git a/paddle/fluid/operators/erfinv_op.h b/paddle/fluid/operators/erfinv_op.h deleted file mode 100644 index 934d0f4a5a7152fbd909082213f2ee7afa22d47f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erfinv_op.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -// ndtri(x * 0.5 + 0.5) / sqrt(2) -template -class ErfinvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto eigen_in = framework::EigenVector::Flatten(*in); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto& place = *ctx.template device_context().eigen_device(); - constexpr T half = static_cast(0.5); - constexpr T half_sqrt = static_cast(M_SQRT1_2); - eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; - } -}; - -// sqrt(pi) / 2 * exp(square(out)) * grad -template -class ErfinvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out = ctx.Input("Out"); - auto dout = ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = *ctx.template device_context().eigen_device(); - - constexpr T half_sqrt_pi = static_cast(1 / M_2_SQRTPI); - eigen_dx.device(place) = - half_sqrt_pi * eigen_dout * eigen_out.square().exp(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 793519b40182114c13e63dd32caaa382d55fa52d..8f8a0f174a79f13f0bee7aa7b425f8c645e15687 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -82,14 +82,8 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, ops::EyeKernel, - ops::EyeKernel); diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h deleted file mode 100644 index 4cec5387e82aa1bbd4bdeb8fbc9681b468e1a0f3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eye_op.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct EyeFunctor { - EyeFunctor(int64_t num_columns, T* output) - : num_columns_(num_columns), output_(output) {} - - HOSTDEVICE void operator()(size_t idx) const { - output_[idx * num_columns_ + idx] = static_cast(1); - } - - int64_t num_columns_; - T* output_; -}; - -template -class EyeKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto num_rows = ctx.Attr("num_rows"); - auto num_columns = ctx.Attr("num_columns"); - if (num_columns == -1) num_columns = num_rows; - - auto* out_tensor = ctx.Output("Out"); - T* out_data = out_tensor->mutable_data(ctx.GetPlace()); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, out_tensor, static_cast(0)); - - int64_t num_eyes = (std::min)(num_rows, num_columns); - platform::ForRange for_range(dev_ctx, num_eyes); - EyeFunctor functor(num_columns, out_data); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc index e109e9d02a03af6bc6c5e440745d3cac2349492f..5ee3202af135bf1941639a0fcb9d9c69d0d13f45 100644 --- a/paddle/fluid/operators/eye_op_npu.cc +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc index ed68bd7b7c2a5fd2aa9cd05c006d91c96b5b8625..e6de430a78c1a38439cbc78f4e930a7bbe1bc463 100644 --- a/paddle/fluid/operators/fill_any_like_op.cc +++ b/paddle/fluid/operators/fill_any_like_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_any_like_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -91,14 +92,3 @@ REGISTER_OPERATOR( ::paddle::framework::EmptyGradOpMaker, ::paddle::framework::EmptyGradOpMaker, ops::FillAnyLikeVarTypeInference) - -REGISTER_OP_CPU_KERNEL( - fill_any_like, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.cu b/paddle/fluid/operators/fill_any_like_op.cu deleted file mode 100644 index 3ebc0ad7c8ec53b5c3de68823d9ba943e49bd364..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_any_like_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/fill_any_like_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fill_any_like, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h deleted file mode 100644 index 36b56394b6f1b1198c65cb7a6a6046d223b31922..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_any_like_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" - -#include "paddle/phi/kernels/full_kernel.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - // TODO(fangzeyang): Once context.Attribute supports double dtype, this - // kernel should be updated to support double dtype, too. 
- float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - const auto& dev_ctx = context.template device_context(); - // call new kernel - phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), - *x, value, phi::DataType::UNDEFINED, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index a584c1341dc0f280d483d5677ef2276b43c003d2..b02e60210c085bfcedb22fe915de6700575b0a4c 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { @@ -54,7 +54,7 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { std::isnan(value), false, platform::errors::InvalidArgument("The filled value is NaN.")); - Tensor tensor_tmp(framework::TransToPtenDataType(data_type)); + Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index 693d4431b2ec8e0546dfe125d3d7bd00f70993c9..ec4ba6e926c41bab8d7ceda20486db39f2d4dabe 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/full_kernel.h" @@ -60,9 +60,9 @@ class FillAnyLikeXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - // call pten kernel + // call phi kernel phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), *x, value, phi::DataType::UNDEFINED, out); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc index 353f73cdd6d05374fe5c9b96dbde7b35ee675c1b..de06aeb01e4dda4e8ca4b4e70ca2c3ad6aa4b5dc 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, ops::FillConstantBatchSizeLikeOpKernel, ops::FillConstantBatchSizeLikeOpKernel { auto &dev_ctx = *pool.Get(platform::CPUPlace()); phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } @@ -72,7 +72,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(ctx.GetPlace()); phi::funcs::SetConstant functor; out->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 9ce433a214dd5becfdd979e635eb83e75216bbaf..5bba4da14aba8bf2a6172b7e212dfca642f527fc 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -72,13 +72,13 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(platform::CPUPlace()); phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } else { out->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); - Tensor tensor_tmp(framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); + Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, value); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index eccc53d8766e25b6f4445699e09f80581a28cf3e..d401b5b82f2b0defd3f2b17ed199d0bd01510859 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -122,7 +122,7 @@ class FillConstantKernel : public framework::OpKernel { << ((data_type == framework::proto::VarType::BF16) ? 
"" : ""); tensor->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CPUPlace()); functor(reinterpret_cast(dev_ctx), @@ -130,7 +130,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 1) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), @@ -142,7 +142,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 2) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(platform::CUDAPinnedPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CUDAPinnedPlace()); functor( @@ -155,7 +155,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 3) { #ifdef PADDLE_WITH_XPU tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index eb684f818fb08b7c27dbf137c6dd189168382064..79018f2a97448a8c6265a969dad37bce77d1b7ee 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -61,7 +61,7 @@ class FillConstantNPUKernel : public framework::OpKernel { out_var->mutable_data(shape, ctx.GetPlace()); if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPtenDataType(data_type)); + Tensor tensor_value(framework::TransToPhiDataType(data_type)); tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h index c202fa23ca891d459d658cd3eb1b080593c7801d..c5cbffbf5c695ffe9d16a530b4c84db094a72df2 100644 --- a/paddle/fluid/operators/fill_op.h +++ b/paddle/fluid/operators/fill_op.h @@ -49,10 +49,10 @@ class FillKernel : public framework::OpKernel { out.Resize(phi::make_ddim(ctx.Attr>("shape"))); auto dtype = static_cast(ctx.Attr("dtype")); - auto pten_dtype = framework::TransToPtenDataType(dtype); + auto phi_dtype = framework::TransToPhiDataType(dtype); platform::CPUPlace cpu; auto force_cpu = ctx.Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), pten_dtype); + out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), phi_dtype); framework::LoDTensor tensor; @@ -61,7 +61,7 @@ class FillKernel : public framework::OpKernel { } else { // Always make tensor in CPU memory. tensor.Resize(out.dims()); - tensor.mutable_data(cpu, pten_dtype); + tensor.mutable_data(cpu, phi_dtype); } framework::VisitDataType( diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 3605eabfc1d9bb236b14187c611eed0d149f0acc..5ef13b38c8a86e16cefdc97be6934b313fdb7bc4 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -132,9 +132,9 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { auto &dev_ctx = context.device_context(); // call new kernel - phi::FlattenKernel::TYPE>( - static_cast::TYPE &>(dev_ctx), *in, start_axis, stop_axis, out); } @@ -153,9 +153,9 @@ class FlattenContiguousRangeGradKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); // call new kernel - phi::FlattenGradKernel::TYPE>( - static_cast::TYPE &>(dev_ctx), *d_out, *xshape, d_x); } diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index 3f6171b8a07b023b547caaaff31f04a988885bd9..fc03ef0afae51ec2c55ebf6f5a36c57b089093a9 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/flip_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -29,6 +29,7 @@ class FlipOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + // TODO move to phi kernel void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, @@ -150,14 +151,6 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, ops::FlipOpGradMaker, ops::FlipOpGradMaker); -REGISTER_OP_CPU_KERNEL( - flip, ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel>, - ops::FlipKernel>); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(flip) diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu deleted file mode 100644 index b9f8b16214fe476622263f914c7e818bef91ba92..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/flip_op.cu +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/flip_op.h" - -#include -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/complex.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -__global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data, - int64_t* x_shape, int64_t* x_stride, - int* flip_dims, int flip_dims_size, - int total_dims) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int cur_indices = idx, rem = 0, dst_offset = 0; - for (int i = 0; i < total_dims; ++i) { - int64_t temp = cur_indices; - cur_indices = cur_indices / x_stride[i]; - rem = temp - cur_indices * x_stride[i]; - // flip the indices if it is in flip_dims - for (int j = 0; j < flip_dims_size; ++j) { - if (i == flip_dims[j]) { - cur_indices = x_shape[i] - 1 - cur_indices; - } - } - dst_offset += cur_indices * x_stride[i]; - cur_indices = rem; - } - out_data[idx] = in_data[dst_offset]; -} - -template -class FlipKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto gplace = ctx.GetPlace(); - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.template device_context(); - - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - auto* in_data = x->data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); - auto flip_dims = ctx.template Attr>("axis"); - - const int flip_dims_size = static_cast(flip_dims.size()); - auto x_dims = x->dims(); - const int total_dims = x_dims.size(); - const int N = x->numel(); - - int block_size = 512; - dim3 dim_block(block_size); - dim3 dim_grid((N + block_size - 1) / block_size); - - for (size_t i = 0; i < flip_dims.size(); ++i) { - if (flip_dims[i] < 0) { - flip_dims[i] += total_dims; - } - } - - auto x_stride = phi::stride(x_dims); - std::vector x_dims_v = phi::vectorize(x_dims); - std::vector x_stride_v = phi::vectorize(x_stride); - - int bytes = total_dims * sizeof(int64_t); - auto x_strides_array_tmp = memory::Alloc(dev_ctx, bytes); - int64_t* x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_stride_v.data(), bytes, - dev_ctx.stream()); - - auto x_shape_array_tmp = memory::Alloc(dev_ctx, bytes); - int64_t* x_shape_array_gpu = - reinterpret_cast(x_shape_array_tmp->ptr()); - memory::Copy(gplace, x_shape_array_gpu, cplace, x_dims_v.data(), bytes, - dev_ctx.stream()); - - bytes = flip_dims_size * sizeof(int); - auto flip_dims_array_tmp = memory::Alloc(dev_ctx, bytes); - int* flip_dims_array_gpu = - reinterpret_cast(flip_dims_array_tmp->ptr()); - memory::Copy(gplace, flip_dims_array_gpu, cplace, flip_dims.data(), bytes, - dev_ctx.stream()); - - flip_cuda_kernel< - T><<>>( - N, in_data, out_data, x_shape_array_gpu, x_strides_array_gpu, - flip_dims_array_gpu, flip_dims_size, total_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - flip, ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel>, - ops::FlipKernel>); diff --git a/paddle/fluid/operators/flip_op.h b/paddle/fluid/operators/flip_op.h deleted file mode 100644 index 3c00df5f67d19a9a58a3fe2f4ed2f64f34128063..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/flip_op.h +++ 
/dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -constexpr size_t dim_bitset_size = 64; - -template -class FlipKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class FlipKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - auto flip_dims = ctx.template Attr>("axis"); - - auto x_dims = x->dims(); - const int total_dims = x_dims.size(); - std::bitset dim_bitset; - for (size_t i = 0; i < flip_dims.size(); ++i) { - int dim = flip_dims[i]; - if (flip_dims[i] < 0) { - dim += total_dims; - } - dim_bitset[dim] = true; - } - auto x_strides = phi::stride(x_dims); - auto numel = x->numel(); - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < numel; ++i) { - int64_t cur_indices = i; - int64_t rem = 0; - int64_t dst_offset = 0; - - for (int d = 0; d < total_dims; ++d) { - int64_t temp = cur_indices; - cur_indices = cur_indices / x_strides[d]; - rem = temp - cur_indices * x_strides[d]; - dst_offset += dim_bitset[d] - ? (x_dims[d] - 1 - cur_indices) * x_strides[d] - : cur_indices * x_strides[d]; - cur_indices = rem; - } - out_data[i] = x_data[dst_offset]; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 7b97663c387ca0636e989f4ccb0d9223fb969f44..40ec9aef190ff4bacd52b19a1c0b12300a35b61e 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -13,7 +13,6 @@ * limitations under the License. */ #include "paddle/fluid/operators/fold_op.h" -#include "paddle/fluid/operators/unfold_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 31fff4b668d543bfa080a4adce2ca9b6f564012a..020277675797358bf87a58ac108e6eaaddb26ccc 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { @@ -123,11 +123,11 @@ class FMHARef { T, T>( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); - SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, - softmax_axis, softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, + softmax_axis, softmax_out_tensor); } else { - SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, softmax_axis, - softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, + softmax_axis, softmax_out_tensor); } transB = CblasNoTrans; @@ -251,9 +251,9 @@ class FMHARef { } if (src_mask_tensor != nullptr) { - SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, - *softmax_out_grad_tensor, softmax_axis, - src_mask_out_grad_tensor); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx_, softmax_out_tensor, *softmax_out_grad_tensor, softmax_axis, + src_mask_out_grad_tensor); // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + // src_mask @@ -272,9 +272,9 @@ class FMHARef { } } else { - SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, - *softmax_out_grad_tensor, softmax_axis, - qk_out_grad_tensor); + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, + *softmax_out_grad_tensor, + softmax_axis, qk_out_grad_tensor); } T* qk_out_grad_data = qk_out_grad_tensor->data(); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 962af435b2312cf876c27e005d19f366d965b1fc..13f1c6808aef2e0873c5ce6493514c47710dcf16 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -34,9 +34,9 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { int input_num = static_cast(ids.size()); framework::Tensor in_ids_( - framework::TransToPtenDataType(framework::proto::VarType::INT64)), + framework::TransToPhiDataType(framework::proto::VarType::INT64)), in_embs_( - framework::TransToPtenDataType(framework::proto::VarType::INT64)); + framework::TransToPhiDataType(framework::proto::VarType::INT64)); framework::DDim in_dim{input_num}; int device_id; #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 177e8f5bcb7bdd1af907c397bfb75db8dd014d88..0ffc4c91b851c12a5329ae5b27bd3300753896a9 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include "dnnl.hpp" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/multi_gru_op.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index fa9fe9d8602012f71ca6829e58561d03b7bfb2f1..21d827c79200c4a368ce7677b01b18ee4ddedb8d 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/index_impl.cu.h" DECLARE_bool(use_curand); @@ -65,7 +66,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); auto shape = GetShape(context); tensor->Resize(shape); @@ -88,15 +88,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, gen_offset)); + auto func = + GaussianGenerator(mean, std, seed_offset.first, gen_offset); + IndexKernel>(dev_cxt, tensor, func); } } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; @@ -116,23 +114,22 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto& dev_cxt = + context.template device_context(); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second)); + auto func = GaussianGenerator(mean, std, seed_offset.first, + seed_offset.second); + IndexKernel>(dev_cxt, tensor, func); } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 95bf96073bdd21ddcadd5e447ba38ecb8dd21b83..a227a8e312765b4311314ea884f2c32443924fbc 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/grid_sampler_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { @@ -292,15 +293,12 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { auto* output_data = output->mutable_data(ctx.GetPlace()); VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] << "; " << output->dims()[2] << "; " << output->dims()[3]; - phi::funcs::SetConstant()( - dev_ctx, output, static_cast(0)); int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); - int block_size = 512; - int grid_size = (count + block_size - 1) / block_size; - VLOG(3) << "cuda launch - grid dims: " << grid_size << "; block dims" - << block_size; - grid_sample_cuda_kernel<<>>( + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, count); + grid_sample_cuda_kernel< + T><<>>( count, n, c, out_h, out_w, in_h, in_w, input->data(), grid->data(), output_data, mode, padding_mode, align_corners); } @@ -467,19 +465,14 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { if (ctx.HasOutput(framework::GradVarName("Grid"))) { auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - grid_grad, static_cast(0)); } int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); - int block_size = 512; - int grid_size = (count + block_size - 1) / block_size; - VLOG(3) << "cuda launch grad kernel - grid dims: " << grid_size - << "; block dims" << block_size << "; count: " << count; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, count); grid_sampler_cuda_backward_kernel< - T><<>>( + T><<>>( count, output_grad->data(), input->data(), grid->data(), n, c, out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, padding_mode, align_corners); diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index 95c6ed6690541e73ed76cf51b78cd0f94c115035..f8f8f3fd789ad61a99bcc17bc073b6cfd099f639 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gumbel_softmax_op.h" -#include -#include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,10 +24,6 @@ class GumbelSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -71,20 +68,6 @@ Samples from the Gumbel-Softmax distribution and optionally discretizes. 
 class GumbelSoftmaxGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out",
-                   "gumbel_softmax_grad");
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
-                   "Out@GRAD", "gumbel_softmax_grad");
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Out"),
-        ctx->GetInputDim(framework::GradVarName("Out")),
-        platform::errors::InvalidArgument("Input(Out) and its gradients "
-                                          "should have the same shape."));
-
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
 };
 
 template <typename T>
@@ -107,17 +90,16 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
+DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor,
+                            PT_INFER_META(phi::GumbelSoftmaxInferMeta));
+DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad,
+                            GumbelSoftmaxGradInferShapeFunctor,
+                            PT_INFER_META(phi::GumbelSoftmaxGradInferMeta));
+
 REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp,
                   ops::GumbelSoftmaxOpMaker,
                   ops::GumbelSoftmaxGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GumbelSoftmaxGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(gumbel_softmax_grad, ops::GumbelSoftmaxGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    gumbel_softmax,
-    ops::GumbelSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GumbelSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    gumbel_softmax_grad,
-    ops::GumbelSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GumbelSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::GumbelSoftmaxGradOpMaker<paddle::imperative::OpBase>,
+                  GumbelSoftmaxInferShapeFunctor);
+REGISTER_OPERATOR(gumbel_softmax_grad, ops::GumbelSoftmaxGradOp,
+                  GumbelSoftmaxGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu
deleted file mode 100644
index 880e3eb9f3f9a9c68392f5ea9cc5ab9465676a3a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gumbel_softmax_op.cu
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/gumbel_softmax_op.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -using KeyValuePair = cub::KeyValuePair; - -template -struct UniformCUDAGenerator { - T min_, max_; - unsigned int seed_; - unsigned int offset_ = 0; - HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed) - : min_(min), max_(max), seed_(seed) {} - HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed, - unsigned int offset) - : min_(min), max_(max), seed_(seed), offset_(offset) {} - - HOSTDEVICE T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - return dist(rng); - } -}; - -template -__global__ void OneHotCUDAKernel(const int64_t height, const int64_t width, - const int64_t size_out_axis, const T init, - const T* in, T* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int64_t idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / size_out_axis; - int w = idx % size_out_axis; - cub::ArgMax reducer; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = reducer( - {k, in[h * width * size_out_axis + k * size_out_axis + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - int index = static_cast(kv_pair.key); - out[h * width * size_out_axis + index * size_out_axis + w] = 1; - } - __syncthreads(); - } -} - -template -struct OneHotGenerator { - static void Transform(const platform::CUDADeviceContext& context, - const Tensor& X, Tensor* Out, int axis) { - const int size_to_axis = SizeToAxis(axis, X.dims()); - const int size_from_axis = SizeFromAxis(axis, X.dims()); - const int size_out_axis = SizeOutAxis(axis, X.dims()); - constexpr int thread_size = 512; - int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0]; - int64_t height = size_to_axis * size_out_axis; - int block_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - Tensor input_tensor; - input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); - paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); - phi::funcs::set_constant(context, Out, 0.0); - OneHotCUDAKernel< - T, thread_size><<>>( - height, size_from_axis / size_out_axis, size_out_axis, - std::numeric_limits::lowest(), input_tensor.data(), - Out->data()); - } -}; - -template -__global__ void AddGumbelNoiseCUDAKernel(const T* input_data, T* output_data, - T* noise, const float temperature, - int64_t n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int step = blockDim.x * gridDim.x; - for (int64_t i = index; i < n; i += step) { - T gumbel_noise = -log(-log(noise[i])); - output_data[i] = (gumbel_noise + input_data[i]) / temperature; - } -} - -template -struct GumbleNoiseGenerator { - static void Transform(const platform::CUDADeviceContext& context, - const T* input_data, T* output_data, int size_to_axis, - int size_from_axis, const float temperature) { - Tensor random_tensor; - int64_t size = size_to_axis * size_from_axis; - T* random_data = - random_tensor.mutable_data({size}, platform::CUDAPlace()); - thrust::counting_iterator index_sequence_begin(0); - - // generate gumbel noise - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy()) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); - } else { - const unsigned int seed = std::random_device()(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed)); - } - - // add gumbel noise to X - const int thread_size = 512; - int64_t block_size = (size + thread_size) / thread_size; - AddGumbelNoiseCUDAKernel< - T><<>>( - input_data, output_data, random_data, temperature, size); - } -}; - -#endif -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - gumbel_softmax, ops::GumbelSoftmaxKernel, - ops::GumbelSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - gumbel_softmax_grad, - ops::GumbelSoftmaxGradKernel, - ops::GumbelSoftmaxGradKernel); diff --git a/paddle/fluid/operators/gumbel_softmax_op.h b/paddle/fluid/operators/gumbel_softmax_op.h deleted file mode 100644 index daddd13d7be5e3d7c742a0fa4def3b1828eb27ce..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gumbel_softmax_op.h +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenTensor = framework::EigenTensor; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeOutAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ArgMaxFunctor { - void operator()(const DeviceContext& ctx, const Tensor& in, - Tensor* index_tensor, const int64_t& axis) { - auto in_eigen = EigenTensor::From(in, in.dims()); - auto index_eigen = EigenTensor::From(*index_tensor); - index_eigen = in_eigen.argmax(axis).template cast(); - } -}; -template -struct GumbleNoiseGenerator; - -template -struct OneHotGenerator; - -template -struct GumbleNoiseGenerator { - static void Transform(const platform::CPUDeviceContext& context, - const T* input_data, T* output_data, int size_to_axis, - int size_from_axis, const float temperature) { - // generate uniform random number - const int size = size_to_axis * size_from_axis; - std::uniform_real_distribution dist(0.00001, 1); - auto engine = paddle::framework::GetCPURandomEngine(0); - Tensor random_tensor; - auto* random_data = - random_tensor.mutable_data({size}, platform::CPUPlace()); - for (int64_t i = 0; i < size; ++i) { - random_data[i] = dist(*engine); - } - - // generate gumbel noise - framework::DDim dim_2d{size_to_axis, size_from_axis}; - auto gumbel_noise_eigen = EigenMatrix::From(random_tensor, dim_2d); - gumbel_noise_eigen = -(((-(gumbel_noise_eigen.log())).log())); - - // add noise - for (int64_t i = 0; i < size_to_axis * size_from_axis; i++) { - output_data[i] = (input_data[i] + random_data[i]) / temperature; - } - } -}; -template -struct OneHotGenerator { - static void Transform(const platform::CPUDeviceContext& context, - const Tensor& X, Tensor* Out, int axis) { - Tensor index; - std::vector index_dim; - const auto rank = X.dims().size(); - const int size_to_axis = SizeToAxis(axis, X.dims()); - const int size_from_axis = SizeFromAxis(axis, X.dims()); - const int size_out_axis = SizeOutAxis(axis, X.dims()); - - for (int i = 0; i < X.dims().size(); i++) { - if (i != axis) index_dim.push_back(X.dims().Get()[i]); - } - DDim index_ddim(index_dim.data(), rank - 1); - index.Resize(index_ddim); - auto* index_data = index.mutable_data(context.GetPlace()); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMaxFunctor functor##rank; \ - functor##rank(context, *Out, &index, axis); - switch (Out->dims().size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - 
PADDLE_ENFORCE_LE(Out->dims().size(), 6, - platform::errors::InvalidArgument( - "gumbel_softmax operator doesn't supports " - "tensors whose ranks are greater " - "than 6 in CPU mode.")); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - - phi::funcs::set_constant(context, Out, 0.0); - for (int i = 0; i < size_to_axis; i++) { - for (int j = 0; j < size_out_axis; j++) { - *(Out->data() + i * size_from_axis + j + - index_data[i * size_out_axis + j] * size_out_axis) = 1.0; - } - } - } -}; - -template -class GumbelSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = X->dims()[axis]; - const bool is_hard = context.Attr("hard"); - const float temperature = context.Attr("temperature"); - PADDLE_ENFORCE_GT(temperature, 0, - platform::errors::InvalidArgument( - "The temperature must be greater than 0. But " - "received temperature = %f", - temperature)); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - if (Out->numel() == 0) { - return; - } - - const int size_to_axis = SizeToAxis(axis, X->dims()); - const int size_from_axis = SizeFromAxis(axis, X->dims()); - Tensor X_noise_2d, Out_2d; - X_noise_2d.Resize({size_to_axis, size_from_axis}); - Out_2d.ShareDataWith(*Out).Resize({size_to_axis, size_from_axis}); - - // generate gumbel noise and add it to X - auto* x_noise_data = X_noise_2d.mutable_data(context.GetPlace()); - GumbleNoiseGenerator::Transform( - context.template device_context(), X->data(), - x_noise_data, size_to_axis, size_from_axis, temperature); - -#ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_noise_2d, - &Out_2d); -#else - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_noise_2d, - &Out_2d); -#endif - - if (is_hard) { - OneHotGenerator::Transform( - context.template device_context(), *X, Out, axis); - } - } -}; - -template -class GumbelSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = dX->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = dX->dims()[axis]; - // allocate memory on device. 
- dX->mutable_data(context.GetPlace()); - if (dX->numel() == 0) { - return; - } - - const int size_to_axis = SizeToAxis(axis, dX->dims()); - const int size_from_axis = SizeFromAxis(axis, dX->dims()); - Tensor dX_2d, Out_2d, dOut_2d; - dX_2d.ShareDataWith(*dX).Resize({size_to_axis, size_from_axis}); - Out_2d.ShareDataWith(*Out).Resize({size_to_axis, size_from_axis}); - dOut_2d.ShareDataWith(*dOut).Resize({size_to_axis, size_from_axis}); - math::SoftmaxGradFunctor()( - context.template device_context(), axis_dim, &Out_2d, - &dOut_2d, &dX_2d); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 041f7487fd2575faa2407ea90c064a2cfdea96c5..3915ce5809c394738c58e80accccac531c268c23 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -12,47 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { class HuberLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "HuberLoss"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "HuberLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), - platform::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - x_dims.size(), y_dims.size())); - - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(y_dims); - if (ctx->IsRuntime() || !contain_unknown_dim) { - PADDLE_ENFORCE_EQ( - x_dims, y_dims, - platform::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - x_dims, y_dims)); - } - - auto out_dims = y_dims; - ctx->SetOutputDim("Residual", out_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -139,14 +112,11 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PT_INFER_META(phi::HuberLossInferMeta)); + REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, - ops::HuberLossGradOpMaker); + ops::HuberLossGradOpMaker, + HuberLossInferShapeFunctor); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL( - huber_loss, ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CPU_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu deleted file mode 100644 index 4ce6856a7eade1b314d8aef1d039424ad42e07cf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/huber_loss_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - huber_loss, - ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CUDA_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h deleted file mode 100644 index ebe26f05ab3e47245176614fb2ce57c264ebf5f5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/huber_loss_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -struct HuberLossForward { - HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return static_cast(0.5) * val * val; - } else { - return delta * (abs_val - static_cast(0.5) * delta); - } - } - - T delta; -}; - -template -class HuberLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("Residual"); - auto* out1 = context.Output("Out"); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto x = EigenVector::Flatten(*in0); - auto y = EigenVector::Flatten(*in1); - out0->mutable_data(context.GetPlace()); - auto residual = EigenVector::Flatten(*out0); - residual.device(place) = y - x; - out1->mutable_data(context.GetPlace()); - auto loss = EigenVector::Flatten(*out1); - loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); - } -}; - -template -struct HuberLossBackward { - HOSTDEVICE HuberLossBackward(const T& delta, T sign) - : sign(sign), delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return sign * val; - } else { - if (val > 0) { - return sign * delta; - } else { - return -1 * sign * delta; - } - } - } - - T sign; - T delta; -}; - -template -class HuberLossGradKernel 
: public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("Residual"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - auto* out1 = context.Output(framework::GradVarName("Y")); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto residual = EigenVector::Flatten(*in0); - auto out_grad = EigenVector::Flatten(*in1); - - if (out0) { - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - x_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, -1.0)); - x_grad.device(place) = out_grad * x_grad; - } - - if (out1) { - out1->mutable_data(context.GetPlace()); - auto y_grad = EigenVector::Flatten(*out1); - y_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, 1.0)); - y_grad.device(place) = out_grad * y_grad; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index 19ced131c00a2a861a5140697b8a199f013ad5bf..6fc6960d3db565d698b252347e5734f949e16211 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc index 767ce542736e831e2ea587fc765ed6c0baf96589..ccddec2779515f26db10440633ab9d9894537182 100644 --- a/paddle/fluid/operators/huber_loss_op_xpu.cc +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 6a195bb9400e89ef09bc7ca2c08637eeb505dda2..33b68d68992dd819f74c2ae67153ecc6b050b16b 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/imag_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,15 +23,6 @@ namespace operators { class ImagOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Imag"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Imag"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class ImagOpMaker : public framework::OpProtoAndCheckerMaker { @@ -88,19 +82,13 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(imag, ops::ImagOp, ops::ImagOpMaker, ops::ImagGradOpMaker, - ops::ImagGradOpMaker); + ops::ImagGradOpMaker, + ImagInferShapeFunctor); REGISTER_OPERATOR(imag_grad, ops::ImagGradOp); - -REGISTER_OP_CPU_KERNEL(imag, ops::ImagKernel>, - ops::ImagKernel>); -REGISTER_OP_CPU_KERNEL(imag_grad, - ops::ImagGradKernel>, - ops::ImagGradKernel>); diff --git a/paddle/fluid/operators/imag_op.cu b/paddle/fluid/operators/imag_op.cu deleted file mode 100644 index 9cfb2ef7f2fef6b25322ba76bedadae3c6ca8d87..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/imag_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/imag_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(imag, - ops::ImagKernel>, - ops::ImagKernel>); -REGISTER_OP_CUDA_KERNEL(imag_grad, - ops::ImagGradKernel>, - ops::ImagGradKernel>); diff --git a/paddle/fluid/operators/imag_op.h b/paddle/fluid/operators/imag_op.h deleted file mode 100644 index 33eab2abb74e177eb863989cd6a1e8132ad09e8c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/imag_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -class ImagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* out = ctx.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ImagFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class ImagGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index c572870d950a8200dc3398e5e1e5c5ab28d2332b..105d818e197434c4ed85126228e06d45bf06e498 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" - -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -39,18 +40,6 @@ class IncrementOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("X")), 1UL, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be 1." 
- "Now the number is %d.", - phi::product(ctx->GetInputDim("X")))); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "increment"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "increment"); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -98,17 +87,9 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PT_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, - ops::IncrementGradOpMaker); -REGISTER_OP_CPU_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); - -REGISTER_OP_CUDA_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); + ops::IncrementGradOpMaker, + IncrementInferShapeFunctor); diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h deleted file mode 100644 index 4b9d07146484ff00ba105b9971f40f91dd8148de..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/increment_op.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class IncrementKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); - float step = context.Attr("step"); - - out_tensor->mutable_data(context.GetPlace()); - auto& dev = - *context.template device_context().eigen_device(); - EigenAdd, T>::Eval( - dev, framework::EigenScalar::From(*out_tensor), - framework::EigenScalar::From(*x_tensor), static_cast(step)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 1c7c8a19110bc8e9e39b95478e4f06ff0eb50ef9..16f1b3b1269952b11f611e6c6988ed3199977994 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/increment_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f --- /dev/null +++ b/paddle/fluid/operators/index_impl.cu.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace paddle { +namespace operators { + +namespace kps = phi::kps; +template +__global__ void VectorizedIndexKernel(T *out, size_t numel, size_t main_offset, + Functor func) { + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], + &args[0], func); + kps::WriteData(out + data_offset, &result[0], + BLOCK_NUM_X * VecSize); + } + size_t num = numel - data_offset; + if (num > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], + &args[0], func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { + int numel = out->numel(); + T *out_data = out->mutable_data(dev_ctx.GetPlace()); + if (numel <= 0) return; + int vec_size = paddle::platform::GetVectorizedSize(out_data); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace operators +} // namespace paddle 
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 8fac84176d97fd371ddfac25dab2aee8c098607a..fda168c94e1e064c65e3b5fcf56b606772345b9d 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -328,7 +328,7 @@ class InterpolateOp : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto interp_method = ctx.Attr("interp_method"); + const auto& interp_method = ctx.Attr("interp_method"); // TODO(danqing): support other interp_method if (this->CanMKLDNNBeUsed(ctx, data_type) && (interp_method == "nearest" || interp_method == "bilinear")) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 7783303785998e9db05a5f5117a047e2729de848..4b5a18141d5aa9ac5d1f5354fafbad0e38bb8474 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -414,7 +414,7 @@ class InterpolateV2Op : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto interp_method = ctx.Attr("interp_method"); + const auto& interp_method = ctx.Attr("interp_method"); // TODO(danqing): support other interp_method if (this->CanMKLDNNBeUsed(ctx, data_type) && (interp_method == "nearest" || interp_method == "bilinear")) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6e9d6a1995474812abe333137cc75ba90a2b4fac..d61eb46d97e98972963f5871a4c6e7b06468337c 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -210,32 +210,66 @@ __global__ void KeNearestNeighbor3DInterpFw( } } +template +__global__ void KeNearestNeighborInterpNCHWBw( + T* in, const size_t in_img_h, const size_t in_img_w, const T* out, + const size_t out_img_h, const size_t out_img_w, const size_t nc, + const float ratio_h, const float ratio_w, const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + T* in_pos = &in[in_index]; + const T out_pos = out[out_index]; + platform::CudaAtomicAdd(in_pos, out_pos); + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + template __global__ void KeNearestNeighborInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, FastDivModForInterpolate divmods) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) @@ -244,15 +278,10 @@ __global__ void KeNearestNeighborInterpBw( ? 
static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - const T out_pos = out[out_id_h * output_w + out_id_w]; + T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + const T out_pos = out[tid]; platform::CudaAtomicAdd(in_pos, out_pos); } } @@ -1842,11 +1871,26 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + platform::GpuLaunchConfig config_3d = + GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); + KeNearestNeighborInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, nc, + ratio_h, ratio_w, align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, + interp_divmods); + } } else if ("bilinear" == interp_method) { const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; bool is_nchw = (data_layout == DataLayout::kNCHW) ? 
true : false; diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 66ab1e14390b33e99b80393d3bddaf9126bca325..f99d3f6c324421534dd51c74c840a8dca5dcedd9 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -65,6 +65,13 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } +#endif +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } #endif vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); return vec_new_data; diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 66314cb74456d66522a83abd4eb37873ab8bf9f2..850dbe025b9cb5f13db58eaab86ce777ec7b97ae 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -14,7 +14,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/interpolate_op.h" +#include "paddle/fluid/operators/interpolate_v2_op.h" #ifdef PADDLE_WITH_XPU @@ -41,18 +41,6 @@ inline std::vector get_new_shape_xpu( return vec_new_shape; } -template -inline std::vector get_new_data_from_tensor_xpu( - const Tensor* new_data_tensor) { - std::vector vec_new_data; - framework::Tensor cpu_starts_tensor; - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - auto* new_data = cpu_starts_tensor.data(); - vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); - return vec_new_data; -} - template class InterpolateV2XPUKernel : public framework::OpKernel { public: @@ -90,7 +78,7 @@ class InterpolateV2XPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor_xpu(scale_tensor); + auto scale_data = get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -202,7 +190,7 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor_xpu(scale_tensor); + auto scale_data = get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 5ae9fd7a6102893d6e4a16c451c6d017ad70de5f..7e07610db2875d45aa250ab084e0eaf493dc7034 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/label_smooth_op.h" - #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace framework { @@ -152,11 +151,3 @@ REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, ops::LabelSmoothGradMaker, ops::LabelSmoothGradMaker); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); -REGISTER_OP_CPU_KERNEL( - label_smooth, - ops::LabelSmoothKernel, - ops::LabelSmoothKernel); -REGISTER_OP_CPU_KERNEL( - label_smooth_grad, - ops::LabelSmoothGradKernel, - ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu deleted file mode 100644 index f149e104eff624fd6145926aec60350b41de3cdf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/label_smooth_op.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/label_smooth_op.h" -namespace paddle { -namespace operators { - -template -struct LabelSmoothFunctor { - T epsilon; - T label_dim; - - __forceinline__ LabelSmoothFunctor(float epsilon_data, int label_dim_data) { - epsilon = static_cast(epsilon_data); - label_dim = static_cast(label_dim_data); - } - - __device__ __forceinline__ T operator()(const T x) const { - return (static_cast(1 - epsilon) * x + - static_cast(epsilon / label_dim)); - } -}; - -template -struct LabelSmoothGradFunctor { - T epsilon; - - __forceinline__ LabelSmoothGradFunctor(float epsilon_data) { - epsilon = static_cast(epsilon_data); - } - - __device__ __forceinline__ T operator()(const T x) const { - return static_cast(1 - epsilon) * x; - } -}; - -template -__global__ void LabelSmoothRunDistKernel(const int N, const float epsilon, - const int dist_numel, const T* src, - const T* dist_data, T* dst) { - CUDA_KERNEL_LOOP(idx, N) { - int dist_idx = idx % dist_numel; - dst[idx] = static_cast(1 - epsilon) * src[idx] + - static_cast(epsilon) * dist_data[dist_idx]; - } -} - -template -class LabelSmoothGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - auto size_prob = in_t->numel(); - const T* in_data = in_t->data(); - T* out_data = out_t->mutable_data(ctx.GetPlace()); - int threads = 512; - int grid = (size_prob + threads - 1) / threads; - auto stream = ctx.cuda_device_context().stream(); - if (dist_t) { - auto dist_numel = dist_t->numel(); - const T* dist_data = dist_t->data(); - LabelSmoothRunDistKernel<<>>( - size_prob, epsilon, dist_numel, in_data, dist_data, out_data); - - } else { - auto& dev_ctx = - 
ctx.template device_context(); - - std::vector ins = {in_t}; - std::vector outs = {out_t}; - auto functor = LabelSmoothFunctor(epsilon, label_dim); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - -template -class LabelSmoothGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - d_in_t->mutable_data(ctx.GetPlace()); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev_ctx = ctx.template device_context(); - - std::vector ins = {d_out_t}; - std::vector outs = {d_in_t}; - auto functor = LabelSmoothGradFunctor(epsilon); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - label_smooth, - ops::LabelSmoothGPUKernel, - ops::LabelSmoothGPUKernel); -REGISTER_OP_CUDA_KERNEL( - label_smooth_grad, - ops::LabelSmoothGradGPUKernel, - ops::LabelSmoothGradGPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h deleted file mode 100644 index 6b509eb64cce6d289032d366552f6bb5e6712388..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/label_smooth_op.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class LabelSmoothKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - out_t->mutable_data(ctx.GetPlace()); - if (label_dim != 0) { - auto epsilon = ctx.Attr("epsilon"); - auto out = framework::EigenVector::Flatten(*out_t); - auto in = framework::EigenVector::Flatten(*in_t); - auto& dev = *ctx.template device_context().eigen_device(); - if (dist_t) { - auto dist = framework::EigenVector::Flatten(*dist_t); - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon) * - dist.broadcast(Eigen::DSizes( - in_t->numel() / label_dim)); - } else { - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon / label_dim); - } - } - } -}; - -template -class LabelSmoothGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - d_in_t->mutable_data(ctx.GetPlace()); - auto d_out_dim = d_out_t->dims()[d_out_t->dims().size() - 1]; - if (d_out_dim != 0) { - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto d_in = framework::EigenVector::Flatten(*d_in_t); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - d_in.device(dev) = static_cast(1 - epsilon) * d_out; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc index af519cc9090b06c789d88d2ef3a2e2d6ba61495b..c24b896e0a49ae5b5c7717a9173d862633fb7cca 100644 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ b/paddle/fluid/operators/label_smooth_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/label_smooth_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/label_smooth_op_xpu.cc b/paddle/fluid/operators/label_smooth_op_xpu.cc index 6b6350753909f0dc319d07904b4d81327262684e..dd8d0c721c9c29242ba06d3bc57b51da04ff69f5 100644 --- a/paddle/fluid/operators/label_smooth_op_xpu.cc +++ b/paddle/fluid/operators/label_smooth_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/label_smooth_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 521a35646c45a257b56783c500b239ce74a5de0a..7a161fb9dd38352ce4f0f0b6d1fc92b725cfcc52 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index d6e2b3ecff8c83e47a9016cc3d233d1aa03fb52b..0e69b397e04c7eda7f515350caf870be5d7b57a5 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -31,9 +31,17 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -48,6 +56,10 @@ class LogSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "The dimension index of Input(x) to perform log_softmax," "default -1 for last dimension") .SetDefault(-1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( LogSoftmax Operator. diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index e36c8b1c1b2531f726cc0e9ec1cde6a7aaac6bb5..29079b8b1385dee3a28c42a178a046fab77e6200 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -164,8 +164,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto gpu_place = context.GetPlace(); // TODO(yuyang18): Strange code here. - memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + paddle::framework::MixVector mixv_new_rows(&new_rows); + memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(context.GetPlace()), gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 42318ca6a8d3e06a8a6560cdf6eef2d67e6116b0..d40b2643785706e843dbd9812e74ca0aa134f7b5 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -21,19 +21,18 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template __global__ void LookupTableV2(T *output, const T *table, const IdT *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; + int idy = blockIdx.x + threadIdx.y * gridDim.x; while (idy < K) { auto id = static_cast(ids[idy]); T *out = output + idy * D; const T *tab = table + id * D; - for (int i = idx; i < D; i += BlockDimX) { + for (int i = idx; i < D; i += blockDim.x) { if (PaddingFlag) { if (id == padding_idx) out[i] = static_cast(0); @@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids, out[i] = tab[i]; } } - idy += BlockDimY * GridDimX; + idy += blockDim.y * gridDim.x; } } -template +template __global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; + int idy = blockIdx.x + threadIdx.y * gridDim.x; while (idy < K) { auto id = static_cast(ids[idy]); const T *out = output + idy * D; T *tab = table + id * D; - for (int i = idx; i < D; i += BlockDimX) { +#ifdef PADDLE_WITH_CUDA + paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); +#else + for (int i = idx; i < D; i += blockDim.x) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } - idy += BlockDimY * GridDimX; +#endif + idy += blockDim.y * gridDim.x; } } @@ -81,8 +84,9 @@ struct LookupTableV2CUDAFunctor { size_t D = table_t->dims()[1]; size_t K = ids_t_->numel(); + const int gridx = 2 * context_.cuda_device_context().GetSMCount(); dim3 threads(256, 4); - dim3 grids(80, 1); + dim3 grids(gridx, 1); const auto *table = table_t->template data(); const auto *ids = ids_t_->template data(); @@ -90,10 +94,10 @@ struct LookupTableV2CUDAFunctor { auto stream = context_.cuda_device_context().stream(); if (padding_idx == -1) { - LookupTableV2<<>>( + LookupTableV2<<>>( output, table, ids, N, K, D, padding_idx); } else { - LookupTableV2<<>>( + LookupTableV2<<>>( output, table, ids, N, K, D, padding_idx); } } @@ -152,14 +156,16 @@ struct LookupTableV2GradCUDAFunctor { new_rows.resize(ids_num); auto gpu_place = context_.GetPlace(); + paddle::framework::MixVector mixv_new_rows(&new_rows); if (!std::is_same::value) { InputTypeConvert<<>>( - ids_data, ids_num, new_rows.MutableData(gpu_place)); + ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); } else { - memory::Copy(gpu_place, new_rows.CUDAMutableData(gpu_place), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); + memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(gpu_place), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); } + mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); @@ -191,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor { int D = d_table_t->dims()[1]; int K = ids_t_->numel(); - dim3 threads(128, 8); - dim3 grids(8, 1); const T *d_output = d_output_t->template data(); const auto *ids = ids_t_->template data(); T *d_table = d_table_t->mutable_data(context_.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#endif - 
LookupTableV2Grad<<>>( + const int gridx = 2 * dev_ctx.GetSMCount(); + dim3 threads(128, 8); + dim3 grids(gridx, 1); + LookupTableV2Grad<<>>( d_table, d_output, ids, N, K, D); } } diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 49f4ff3107026000726738a640d635739023bc62..f323e2e041d994eb01c9d4e934984b8a005ffcec 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -88,8 +88,8 @@ void SetValueCompute(const framework::ExecutionContext& ctx, // set_value is what we want. paddle::framework::TensorCopy(*in, place, out); - Tensor slice_tensor(framework::TransToPtenDataType(dtype)), - pad_tensor(framework::TransToPtenDataType(dtype)); + Tensor slice_tensor(framework::TransToPhiDataType(dtype)), + pad_tensor(framework::TransToPhiDataType(dtype)); slice_tensor.mutable_data(slice_dims, place); pad_tensor.mutable_data(in_dims, place); @@ -147,7 +147,7 @@ void SetValueCompute(const framework::ExecutionContext& ctx, ElementwiseComputeEx, DeviceContext, T>( ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); } else { - Tensor value_t(framework::TransToPtenDataType(dtype)); + Tensor value_t(framework::TransToPhiDataType(dtype)); auto value_dims = phi::make_ddim(shape); CheckIsDimsMatch(slice_dims_for_assign, value_dims); @@ -224,8 +224,8 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->mutable_data(dev_ctx.GetPlace()); phi::AddRawKernel< - T, typename paddle::framework::ConvertToPtenContext::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), src1, src2, -1, out); } @@ -237,8 +237,8 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->mutable_data(dev_ctx.GetPlace()); phi::SubtractRawKernel< - T, typename paddle::framework::ConvertToPtenContext::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), src1, src2, -1, out); } diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index c6405f65ee3dd21bf0c993bff93f491b706b32f4..a2e34d98461e0107f27d51d3ce7a618c34ca7ea3 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -26,6 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -246,8 +247,8 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { const auto& labels_dims = labels->dims(); const int axis = logits_dims.size() - 1; - const int N = SizeToAxis(axis, logits_dims); - const int D = SizeFromAxis(axis, logits_dims); + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); int blocks = NumBlocks(N); int threads = kNumCUDAThreads; @@ -401,8 +402,8 @@ class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; - const int N = SizeToAxis(axis, sofrmax_dims); - const int D = SizeFromAxis(axis, sofrmax_dims); + const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); if (return_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), diff --git 
a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h index fe0dab5d47d35a56e1806ecb2c47e9cfc8197cd0..9261c84c8552c3eb6b441a28324859970eb0a0b3 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.h +++ b/paddle/fluid/operators/margin_cross_entropy_op.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d5336af8f05ef7fce1d5b1a2153cb8928772e232..ac6566a87030d4c9cf613134cfe85c379fea5e20 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -6,9 +6,9 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) -math_library(concat_and_split DEPS npu_op_runner) +math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) else() -math_library(concat_and_split) +math_library(concat_and_split DEPS concat_and_split_functor) endif() math_library(context_project DEPS im2col math_function) math_library(cross_entropy) @@ -42,6 +42,7 @@ endif() math_library(fc DEPS blas jit_kernel_helper) math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) math_library(prelu) diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index c954bdf81d30d13abc8383544e17709ee249cc99..486979aa0a8b3009d09f73de54f9b7b3ac8a77ad 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -357,8 +357,9 @@ class BeamSearchFunctor { framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); selected_lod[1].resize(scores->dims()[0] + 1); - size_t* selected_offsets = - selected_lod[1].CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mix_vector(&selected_lod[1]); + paddle::framework::MixVector mixv_abs(&abs_lod[level]); + size_t* selected_offsets = mix_vector.CUDAMutableData(context.GetPlace()); if (num_seqs == 1) { const int seq_length = static_cast(abs_lod[level][1]); @@ -377,7 +378,7 @@ class BeamSearchFunctor { is_accumulated, num_used_threads)); } } else if (num_seqs <= 4) { - const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + const size_t* seq_offsets = mixv_abs.CUDAData(context.GetPlace()); // Use only 1 block const int kMaxThreadsPerSeq = 32; const int kMaxSeqs = 4; @@ -400,6 +401,7 @@ class BeamSearchFunctor { } context.Wait(); + mix_vector.CopyToCPU(); if (!framework::CheckLoD(selected_lod)) { PADDLE_THROW(platform::errors::InvalidArgument( "lod %s is not right in" diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 8ec89f1b60acebdb0d1da8b6a07113b1f4c23ef0..46126ac59c892787d2f63956983404843e518ae7 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/phi/kernels/cpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif @@ -46,9 +46,8 @@ class ConcatFunctor { void operator()(const platform::CPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - std::vector pt_input{input.begin(), input.end()}; - phi::ConcatImpl(context, pt_input, axis, - output); + phi::funcs::ConcatFunctor functor; + functor(context, input, axis, output); } }; @@ -63,11 +62,8 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { - std::vector pt_ref_inputs{ref_inputs.begin(), - ref_inputs.end()}; - std::vector pt_outputs{outputs->begin(), outputs->end()}; - phi::SplitImpl(context, input, pt_ref_inputs, - axis, &pt_outputs); + phi::funcs::SplitFunctor functor; + functor(context, input, ref_inputs, axis, outputs); } }; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 51f94afcfc1b99755d5f9dca8460a56fc76cf543..e51631385eb75a63083e0cbbd2a8632d689be8f1 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/phi/kernels/gpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { namespace operators { namespace math { @@ -29,10 +29,8 @@ class ConcatFunctor { void operator()(const platform::CUDADeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - std::vector pt_input{input.begin(), input.end()}; - - phi::ConcatImpl(context, pt_input, axis, - output); + phi::funcs::ConcatFunctor functor; + functor(context, input, axis, output); } }; @@ -43,16 +41,12 @@ class ConcatFunctor { template class SplitFunctor { public: - SplitFunctor(); void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ref_inputs, int axis, std::vector* outputs) { - std::vector pt_ref_inputs{ref_inputs.begin(), - ref_inputs.end()}; - std::vector pt_outputs{outputs->begin(), outputs->end()}; - phi::SplitImpl( - context, input, pt_ref_inputs, axis, &pt_outputs); + phi::funcs::SplitFunctor functor; + functor(context, input, ref_inputs, axis, outputs); } }; diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 65d2ca79e60c2ec90d879ce9818c398adc93c73c..b5b0aae23ac875c7afeb4148309138aae49e5b4a 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -64,17 +64,3 @@ class SplitFunctor { } // namespace math } // namespace operators } // namespace paddle - -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16); \ - macro(::paddle::platform::bfloat16); \ - macro(::paddle::platform::complex); \ - macro(::paddle::platform::complex); diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 8efd35ca108100e4d224890846433433702c57a9..8fc6c52122abfe48d87a14ae274849a18c020546 100644 --- 
a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -22,6 +22,10 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +namespace phi { +class CPUContext; +} // namespace phi + namespace paddle { namespace operators { namespace math { @@ -31,12 +35,12 @@ namespace math { * col = * [input_channels, filter_height, filter_width, output_height, output_width] */ -template -class Im2ColFunctor { +template +class Im2ColFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { @@ -73,12 +77,11 @@ class Im2ColFunctor -class Col2ImFunctor { +template +class Col2ImFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im, @@ -155,22 +158,30 @@ template class Im2ColFunctor; template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; /* * im = [input_channels, input_height, input_width] * col = * [output_height, output_width, input_channels, filter_height, filter_width] */ -template -class Im2ColFunctor { +template +class Im2ColFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { @@ -235,12 +246,11 @@ class Im2ColFunctor -class Col2ImFunctor { +template +class Col2ImFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im, @@ -316,11 +326,18 @@ template class Im2ColFunctor; template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; - +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 67165ff2219891e3518673845ce224a30b117ff8..fcd5c06a6f310f8a23608a77f2d6b9098e99b33a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -170,7 +170,8 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + paddle::framework::MixVector mixv_in2_rows(&in2_rows); + mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu 
b/paddle/fluid/operators/math/selected_rows_functor.cu index ea0b0bb29548bef0792d00f177d6789daf211ad6..8563d8b05b186c025ecc4c970a400765adeb0c5d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -161,9 +161,10 @@ struct SelectedRowsAddTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); + paddle::framework::MixVector mixv_in1_rows(&in1_rows); SelectedRowsAddTensorKernel< T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); @@ -198,8 +199,9 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows + paddle::framework::MixVector mixv_in2_rows(&in2_rows); if (in1_rows.size()) { - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end()); } auto in1_place = input1.place(); @@ -274,9 +276,10 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); + paddle::framework::MixVector mixv_in1_rows(&in1_rows); SelectedRowsAddToTensorKernel< T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, in1_row_numel); } }; @@ -356,10 +359,13 @@ struct MergeAdd { dim3 threads(block_size, 1); dim3 grid1(input_rows.size(), 1); + paddle::framework::MixVector mix_vector_input(&input_rows); + paddle::framework::MixVector mix_vector_out(out.mutable_rows()); MergeAddKernel<<>>( - input_data, input_rows.CUDAData(context.GetPlace()), out_data, - out.mutable_rows()->CUDAMutableData(context.GetPlace()), - out.rows().size(), input_width); + input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data, + mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(), + input_width); + mix_vector_out.CopyToCPU(); } void operator()(const platform::CUDADeviceContext& context, @@ -423,10 +429,13 @@ struct MergeAdd { auto& input_rows = input->rows(); dim3 grid1(input_rows.size(), 1); + paddle::framework::MixVector mix_vector_input(&input_rows); + paddle::framework::MixVector mix_vector_out(out.mutable_rows()); MergeAddKernel<<>>( - input_data, input_rows.CUDAData(context.GetPlace()), out_data, - out.mutable_rows()->CUDAMutableData(context.GetPlace()), - out.rows().size(), input_width); + input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data, + mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(), + input_width); + mix_vector_out.CopyToCPU(); } } }; diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index cd1ca572689bc701da801384e5ed08fe6dc10749..f56c5293971bce3b43e86686e828fad4c90639f5 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -72,8 +72,9 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); auto stream = context.stream(); + paddle::framework::MixVector mix_index_lod(&index_lod); CopyMatrixRowsKernel<<>>( - src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + src_data, dst_data, mix_index_lod.CUDAData(context.GetPlace()), height, width, is_src_index); } }; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 
65bf77f0d152b99059eea2ba98b5d2f0945dc273..01fd2d403c4564ba022e3ab9633fa04d998dd662 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -59,7 +59,7 @@ class PaddingLoDTensorFunctor { int lod_level = 0, bool norm_by_times = false, const PadLayout layout = kBatchLengthWidth) { auto seq_lod = seq_tensor.lod(); - const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; const auto& seq_tensor_dims = seq_tensor.dims(); const auto& pad_tensor_dims = pad_tensor->dims(); int max_seq_len = MaximumSequenceLength(seq_offsets); @@ -104,10 +104,11 @@ class PaddingLoDTensorFunctor { T* pad_data = pad_tensor->data(); const T* pad_value_data = pad_value.data(); + paddle::framework::MixVector mix_vector_seq_offsets(&seq_offsets); SequencePaddingKernel<<>>( pad_data, seq_data, pad_value_data, pad_value.numel() == 1, - seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, - step_width, norm_by_times, layout); + mix_vector_seq_offsets.CUDAData(context.GetPlace()), seq_num, + pad_seq_len, step_width, norm_by_times, layout); } }; @@ -157,9 +158,10 @@ class UnpaddingLoDTensorFunctor { const T* pad_data = pad_tensor.data(); T* seq_data = seq_tensor->data(); + paddle::framework::MixVector mixv_seq_offsets(&seq_offsets); SequencePaddingKernel<<>>( seq_data, pad_data, nullptr, false, - seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + mixv_seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, step_width, norm_by_times, layout); } }; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 1c09acf52fae3f911b3c5e46855c9343a88ffae8..fa7b043153851460c9c8d5586ddce88872b7e3c7 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -168,41 +168,42 @@ class SequencePoolFunctor { const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); + paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { sequence_pool_kernel< T, MaxPoolFunctor><<>>( MaxPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_kernel< T, AvgPoolFunctor><<>>( AvgPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_kernel< T, SumPoolFunctor><<>>( SumPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_kernel< T, SqrtPoolFunctor><<>>( SqrtPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_kernel< T, LastPoolFunctor><<>>( LastPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + 
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_kernel< T, FirstPoolFunctor><<>>( FirstPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -335,41 +336,42 @@ class SequencePoolGradFunctor { const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; dim3 threads(1024, 1); dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); + paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { sequence_pool_grad_kernel< T, MaxPoolGradFunctor><<>>( MaxPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_grad_kernel< T, AvgPoolGradFunctor><<>>( AvgPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_grad_kernel< T, SumPoolGradFunctor><<>>( SumPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_grad_kernel< T, SqrtPoolGradFunctor><<>>( SqrtPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_grad_kernel< T, LastPoolGradFunctor><<>>( LastPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_grad_kernel< T, FirstPoolGradFunctor><<>>( FirstPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 1807c77e37ca16967d24c423a1bebac779f59ce5..8e02d1b70ff83b3641d498567a236ffcb41bb988 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -41,21 +41,23 @@ class ScaleLoDTensorFunctor { auto lod = seq->lod(); const size_t num_seq = lod[level].size() - 1; const size_t seq_width = seq->numel() / seq->dims()[0]; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto abs_offset_lod = framework::ToAbsOffset(lod); T* seq_data = seq->mutable_data(context.GetPlace()); + paddle::framework::MixVector mix_vector(&(abs_offset_lod[level])); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL( HIP_KERNEL_NAME(SequenceScaleKernel), dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(), - seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), - scales, seq_width); + seq_data, 
mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #else SequenceScaleKernel<<< num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), - scales, seq_width); + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif + mix_vector.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index fa2018178f44ff4e3b14937c1f508fa8a698e20e..c855cb763a97b24222c77f064f80fbc2a50e1f9f 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace operators { @@ -26,6 +27,13 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 692a077f1050fff064e89ef4fad2633972af2d9c..fd879e9e6ffe72a2175acc2db98727f5ff39fbbb 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -139,6 +140,16 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 1f87513bb4bea0208bc8de945aa56ffed198ab61..2598d3b0277c94a52e1fa14b04c00b595071f312 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -35,8 +35,8 @@ class AccuracyMLUKernel : public framework::OpKernel { } // cast `indices` or `label` if their type is not INT32 - Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); - Tensor label_int32(framework::TransToPtenDataType(VT::INT32)); + Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + Tensor label_int32(framework::TransToPhiDataType(VT::INT32)); auto indices_type = framework::TransToProtoVarType(indices->type()); if (indices_type != VT::INT32) { PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32), true, @@ -78,7 +78,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // equal MLUCnnlTensorDesc indices_int32_desc(indices_int32); MLUCnnlTensorDesc label_int32_desc(label_int32); - Tensor equal_tensor(framework::TransToPtenDataType(VT::BOOL)); + Tensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); 
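Editorial note: the softmax.cc / softmax.cu hunks above add explicit template instantiations of SoftmaxFunctor / SoftmaxGradFunctor for the new phi::CPUContext and phi::GPUContext device types; the template arguments were lost in this rendering of the patch. The sketch below shows the general explicit-instantiation pattern those lines rely on, using stand-in types rather than the real Paddle ones.

#include <cmath>
#include <cstddef>

struct CPUContext {};  // stand-in for phi::CPUContext, illustrative only

template <typename Context, typename T>
struct SoftmaxFunctor {
  // Numerically stable softmax over a single row of length n.
  void operator()(const Context&, const T* in, T* out, std::size_t n) const {
    if (n == 0) return;
    T max_v = in[0];
    for (std::size_t i = 1; i < n; ++i) max_v = in[i] > max_v ? in[i] : max_v;
    T sum = static_cast<T>(0);
    for (std::size_t i = 0; i < n; ++i) {
      out[i] = std::exp(in[i] - max_v);
      sum += out[i];
    }
    for (std::size_t i = 0; i < n; ++i) out[i] /= sum;
  }
};

// Explicit instantiations: the definition stays in one translation unit and
// other files link against these symbols, which is what the patch does for
// SoftmaxFunctor<phi::CPUContext, float> and the other listed types.
template struct SoftmaxFunctor<CPUContext, float>;
template struct SoftmaxFunctor<CPUContext, double>;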
equal_tensor.Resize(indices->dims()); equal_tensor.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_tensor_desc(equal_tensor); @@ -88,7 +88,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&equal_tensor)); // cast equal - Tensor equal_fp32(framework::TransToPtenDataType(VT::FP32)); + Tensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); equal_fp32.Resize(indices->dims()); equal_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_fp32_desc(equal_fp32); @@ -99,7 +99,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // [correct] // reduce_max - Tensor correct_max(framework::TransToPtenDataType(VT::FP32)); + Tensor correct_max(framework::TransToPhiDataType(VT::FP32)); correct_max.Resize(phi::make_ddim({num_samples})); correct_max.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_max_desc(correct_max); @@ -112,7 +112,7 @@ class AccuracyMLUKernel : public framework::OpKernel { correct_max_desc.get(), GetBasePtr(&correct_max)); // reduce_sum - Tensor correct_sum(framework::TransToPtenDataType(VT::FP32)); + Tensor correct_sum(framework::TransToPhiDataType(VT::FP32)); correct_sum.Resize(correct->dims()); correct_sum.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_sum_desc(correct_sum); @@ -138,7 +138,7 @@ class AccuracyMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, num_samples, total_desc.get(), GetBasePtr(total)); // use `total` of type `float32` for calculating accuracy - Tensor total_fp32(framework::TransToPtenDataType(VT::FP32)); + Tensor total_fp32(framework::TransToPhiDataType(VT::FP32)); total_fp32.Resize(total->dims()); total_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_fp32_desc(total_fp32); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index bc13321473b88fd89a635259e6c4e8c4c113cc1b..e8c80096dd88bf9542794a850f08be931b221e81 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -262,6 +262,10 @@ using EluMKLDNNFunctor = MKLDNNActivationFunc; template using ExpMKLDNNFunctor = MKLDNNActivationFunc; +template +using RoundMKLDNNFunctor = + MKLDNNActivationFunc; + template using ReluMKLDNNGradFunctor = MKLDNNActivationGradFunc; @@ -330,6 +334,10 @@ namespace ops = paddle::operators; ops::MKLDNNActivationGradKernel< \ ops::grad_functor>); +#define REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(act_type, functor) \ + REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationKernel>); + #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ @@ -341,6 +349,8 @@ namespace ops = paddle::operators; __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor); + REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 33ea36d24b8aef833890277fd69ed02e4859802f..04b90d2f1f380a72dd076774f2b68c2d1bc7e55b 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -53,17 +53,13 @@ class 
InterpolateMKLDNNKernel : public framework::OpKernel { std::vector ComputeOutputShape( const framework::ExecutionContext& ctx) const { const auto* x = ctx.Input("X"); - auto in_dims = x->dims(); - const bool is_channel_last = false; // In mkldnn kernel, always use NCHW - - framework::DDim in_dhw_dims; - if (is_channel_last) { // NDHWC, NHWC, NWC - in_dhw_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { // NCDHW, NCHW, NCW - in_dhw_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } + const auto& in_dims = x->dims(); + + const framework::DDim in_dhw_dims = + phi::slice_ddim(in_dims, 2, in_dims.size()); std::vector out_dims; + out_dims.reserve(5); if (in_dhw_dims.size() == 1) { out_dims.push_back(ctx.Attr("out_w")); } else if (in_dhw_dims.size() == 2) { @@ -125,12 +121,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { "out_d, out_h, out_w of Op(interpolate) " "should be greater than 0.")); - out_dims.insert(out_dims.begin(), in_dims[0]); - if (is_channel_last) { - out_dims.push_back(in_dims[in_dims.size() - 1]); - } else { - out_dims.insert(out_dims.begin() + 1, in_dims[1]); - } + const std::vector nc_dims = {in_dims[0], in_dims[1]}; + out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); return out_dims; } @@ -143,12 +135,12 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { const auto* x = ctx.Input("X"); auto* z = ctx.Output("Out"); - auto interp_method = ctx.Attr("interp_method"); - dnnl::algorithm algo = (interp_method == "nearest") - ? dnnl::algorithm::resampling_nearest - : dnnl::algorithm::resampling_linear; + const auto interp_method = ctx.Attr("interp_method"); + const dnnl::algorithm algo = (interp_method == "nearest") + ? dnnl::algorithm::resampling_nearest + : dnnl::algorithm::resampling_linear; - auto out_dims_vec = ComputeOutputShape(ctx); + const auto out_dims_vec = ComputeOutputShape(ctx); framework::DDim dim_out = phi::make_ddim(out_dims_vec); z->Resize(dim_out); @@ -162,6 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { const std::unordered_map args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + resampling_prim->execute(astream, args); astream.wait(); @@ -184,6 +177,7 @@ REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel, ops::InterpolateMKLDNNKernel, ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..626d3ef40b16655af7b45092cf388a92d62c349d --- /dev/null +++ b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +class LogSoftmaxMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* x, + const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + const auto logsoftmax_tz = phi::vectorize(x->dims()); + const auto md = dnnl::memory::desc( + logsoftmax_tz, platform::MKLDNNGetDataType(), x->format()); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_inference, + md, axis); + } +}; + +template +class LogSoftmaxMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + + int axis = ctx.Attr("axis"); + axis = axis >= 0 ? axis : x->dims().size() + axis; + + LogSoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), x, axis); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto logsoftmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + logsoftmax_p->execute(astream, {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(x->format()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(log_softmax, MKLDNN, ::paddle::platform::CPUPlace, + ops::LogSoftmaxMKLDNNKernel, + ops::LogSoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 2effcbf9f46dd52d9f2dde03a08bcda7a5247e1a..a0e50aa297851b1c6129b169e01b6fa43c1c326c 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
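Editorial note: the new log_softmax oneDNN kernel added above delegates the math to the dnnl::logsoftmax_forward primitive; for readers of the patch, the value it produces per row along the canonicalized axis is x_i - max(x) - log(sum_j exp(x_j - max(x))). Below is a minimal host-side reference of that computation, assuming the last axis; it is illustrative only and not the oneDNN code path.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Negative axes are wrapped exactly as in the kernel above:
// axis = axis >= 0 ? axis : rank + axis.
inline int CanonicalAxis(int axis, int rank) {
  return axis >= 0 ? axis : rank + axis;
}

void LogSoftmaxLastAxis(const std::vector<float>& x, int rows, int cols,
                        std::vector<float>* y) {
  y->resize(x.size());
  for (int r = 0; r < rows; ++r) {
    const float* in = x.data() + static_cast<std::size_t>(r) * cols;
    float* out = y->data() + static_cast<std::size_t>(r) * cols;
    const float max_v = *std::max_element(in, in + cols);
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) sum += std::exp(in[c] - max_v);
    const float log_sum = std::log(sum);
    for (int c = 0; c < cols; ++c) out[c] = in[c] - max_v - log_sum;
  }
}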
*/ -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -70,7 +71,8 @@ class SoftmaxMKLDNNHandler out_grad->dims(), in_x_grad->dims())); auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), dims.size()); auto softmax_tz = phi::vectorize(dims); auto data_softmax_md = MKLDNNMemDesc( @@ -96,7 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { Tensor* output = ctx.Output("Out"); bool is_inplaced = input->IsSharedBufferWith(*output); - const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), input->dims().size()); SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, output, axis); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 9c5bad86278ed0f47ebb5ebd4ede19b714ec8120..2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -31,7 +31,7 @@ USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 92c58ae0a77679147f76c5113e07f300f4cf2ba2..c776cf2a7c792c429fcf45a367d3f06bf9add5d2 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 056e0690c01fdb1e7a9726db6905c05c7dc1eb54..2cbecba9fa081970221242555b6b805ff9acae83 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -85,7 +85,7 @@ inline cnnlDataType_t ToCnnlDataType( inline cnnlDataType_t ToCnnlDataType( const paddle::framework::proto::VarType::Type& type) { - return ToCnnlDataType(framework::TransToPtenDataType(type)); + return ToCnnlDataType(framework::TransToPhiDataType(type)); } template diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 02479222747df9824cf8d0eacddd89a74a8ea28e..1143f9cb37aa54bea430d3a8bca8b62b02da4e2b 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -11,16 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/multinomial_op.h" #include #include #include -#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -46,61 +46,6 @@ This OP returns a Tensor filled with the sampled categoris according to Multinom class MultinomialOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Multinomial"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multinomial"); - - auto x_dim = ctx->GetInputDim("X"); - int64_t x_rank = x_dim.size(); - PADDLE_ENFORCE_GT(x_rank, 0, - platform::errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be > 0, but got %d.", - x_rank)); - PADDLE_ENFORCE_LE(x_rank, 2, - platform::errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be <= 2, but got %d.", - x_rank)); - - std::vector out_dims(x_rank); - for (int64_t i = 0; i < x_rank - 1; i++) { - out_dims[i] = x_dim[i]; - } - - int64_t num_samples = ctx->Attrs().Get("num_samples"); - PADDLE_ENFORCE_GT( - num_samples, 0, - platform::errors::InvalidArgument( - "The number of samples should be > 0, but got %d.", num_samples)); - out_dims[x_rank - 1] = num_samples; - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } -}; - -template -class MultinomialOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto *in_data = x->data(); - int64_t *out_data = out->mutable_data(ctx.GetPlace()); - - auto in_dims = x->dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; - - MultinomialFunctor(out_data, in_data, num_samples, replacement, - num_categories, num_distributions); - } }; } // namespace operators @@ -108,11 +53,10 @@ class MultinomialOpKernel namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PT_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - multinomial, ops::MultinomialOpKernel, - ops::MultinomialOpKernel); + paddle::framework::EmptyGradOpMaker, + MultinomialInferShapeFunctor); diff --git a/paddle/fluid/operators/multinomial_op.cu b/paddle/fluid/operators/multinomial_op.cu deleted file mode 100644 index a07cae8d3dabc98d22ff2423a605915e8260a802..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multinomial_op.cu +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
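Editorial note: the multinomial_op.cc hunk above removes the hand-written InferShape and the CPU kernel, registering phi::MultinomialInferMeta through the DELCARE_INFER_SHAPE_FUNCTOR / PT_INFER_META macros (spelled as they appear in the patch). The shape rule itself does not change: the output keeps every leading dimension of X and replaces the last one with num_samples. A standalone sketch of that rule, not the phi implementation:

#include <cassert>
#include <cstdint>
#include <vector>

// Output dims for multinomial: same rank as X (1 or 2), last dim = num_samples.
std::vector<int64_t> MultinomialOutShape(const std::vector<int64_t>& x_dims,
                                         int64_t num_samples) {
  // Checks mirrored from the removed InferShape.
  assert(!x_dims.empty() && x_dims.size() <= 2);
  assert(num_samples > 0);
  std::vector<int64_t> out_dims(x_dims);
  out_dims.back() = num_samples;  // e.g. {batch, num_categories} -> {batch, num_samples}
  return out_dims;
}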
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// To-do(qili93): fix this after issue resolved -// https://github.com/ROCmSoftwarePlatform/rocPRIM/issues/202 - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/multinomial_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -__global__ void NormalizeProbability(T* norm_probs, const T* in_data, - T* sum_rows, int64_t num_distributions, - int64_t num_categories) { - int id = threadIdx.x + blockIdx.x * blockDim.x + - blockIdx.y * gridDim.x * blockDim.x; - if (id < num_distributions * num_categories) { - PADDLE_ENFORCE( - in_data[id] >= 0.0, - "The input of multinomial distribution should be >= 0, but got %f.", - in_data[id]); - int64_t row_id = id / num_categories; - PADDLE_ENFORCE(sum_rows[row_id] > 0.0, - "The sum of one multinomial distribution probability should " - "be > 0, but got %f.", - sum_rows[row_id]); - norm_probs[id] = in_data[id] / sum_rows[row_id]; - } -} - -template -__global__ void GetCumulativeProbs(T* norm_probs_data, - int64_t num_distributions, - int64_t num_categories, - T* cumulative_probs) { - int id = blockIdx.x; - thrust::inclusive_scan(thrust::device, norm_probs_data + id * num_categories, - norm_probs_data + (id + 1) * num_categories, - cumulative_probs + id * num_categories); -} - -template -struct RandomGeneratorCudaFunctor { - unsigned int seed_; - __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n); - return dist(rng); - } -}; - -template -__device__ int binarySearchFunctor(T* cumulative_probs, T* norm_probs_data, - int num_categories, T rng_number) { - int left = 0; - int right = num_categories; - - while (right - left > 0) { - int mid = left + (right - left) / 2; - - T temp_prob = cumulative_probs[mid]; - if (temp_prob < rng_number) { - left = mid + 1; - } else { - right = mid; - } - } - - if (left == num_categories) { - left = num_categories - 1; - } - - while (left >= 1 && norm_probs_data[left] == 0) left--; - - return left; -} - -template -__global__ void sampleMultinomialWithReplacement( - T* rng_data, const int64_t num_samples, int64_t* out_data, - const int64_t num_distributions, const int64_t num_categories, - T* cumulative_probs, T* norm_probs_data) { - // use binary search to get the selected category sample id. - // let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. 
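Editorial note: the CUDA kernels being deleted here (the GPU multinomial path, superseded by the phi kernel) sample a category by binary-searching the row's cumulative probabilities for a uniform draw in [0, 1): the result is the smallest index whose cumulative probability is not below the draw. A host-side sketch of that search, mirroring the removed binarySearchFunctor:

#include <vector>

int SearchCategory(const std::vector<float>& cumulative_probs, float rng) {
  int left = 0;
  int right = static_cast<int>(cumulative_probs.size());
  while (right - left > 0) {
    int mid = left + (right - left) / 2;
    if (cumulative_probs[mid] < rng) {
      left = mid + 1;  // answer lies strictly to the right of mid
    } else {
      right = mid;     // mid is still a candidate
    }
  }
  if (left == static_cast<int>(cumulative_probs.size())) left -= 1;  // rounding guard
  // (the removed device code additionally steps back over trailing
  //  zero-probability categories, which this sketch omits)
  return left;
}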
- - // for every distribution - int dist = blockIdx.y; - // for every sample - int sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample < num_samples) { - T rng_number = rng_data[sample + dist * num_samples]; - - // Find the bucket that a uniform random number lies in - int selected_category = binarySearchFunctor( - cumulative_probs + dist * num_categories, - norm_probs_data + dist * num_categories, num_categories, rng_number); - - out_data[sample + dist * num_samples] = selected_category; - } -} - -template -class MultinomialOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto* in_data = x->data(); - int64_t* out_data = out->mutable_data(ctx.GetPlace()); - - auto in_dims = x->dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; - - // If replacement is False, it's not a replaceable sample. Every category - // can - // be used only once. So after every sample, probability of the distribution - // will change. The implementation can't be parallelizable. Thus, call CPU - // implementation ``MultinomialFunctor`` to sample the distribution. - if (!replacement) { - int64_t in_data_numel = x->numel(); - int64_t out_data_numel = out->numel(); - - T* cpu_in_data = new T[in_data_numel]; - int64_t* cpu_out_data = new int64_t[out_data_numel]; - -#ifdef PADDLE_WITH_HIP - hipMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T), - hipMemcpyDeviceToHost); -#else - cudaMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T), - cudaMemcpyDeviceToHost); -#endif - - MultinomialFunctor(cpu_out_data, cpu_in_data, num_samples, replacement, - num_categories, num_distributions); - -#ifdef PADDLE_WITH_HIP - hipMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(int64_t), - hipMemcpyHostToDevice); -#else - cudaMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(int64_t), - cudaMemcpyHostToDevice); -#endif - - delete[] cpu_in_data; - delete[] cpu_out_data; - return; - } - - // Sum of input may not be 1. To get probability in range [0, 1], calculate - // sum of each row of input, and then use the sum to normalize the input. - // sum_row_data: sum of each row - framework::Tensor sum_rows_tensor; - auto* sum_rows_data = - sum_rows_tensor.mutable_data({num_distributions}, ctx.GetPlace()); - - auto& place = *ctx.template device_context() - .eigen_device(); - - if (num_distributions == 1) { - auto eigen_input = framework::EigenVector::Flatten(*x); - auto eigen_sum_rows = framework::EigenVector::Flatten(sum_rows_tensor); - eigen_sum_rows.device(place) = - eigen_input.sum(Eigen::DSizes(1)) - .eval() - .reshape(Eigen::DSizes(sum_rows_tensor.dims()[0])); - } else { - auto eigen_input = framework::EigenMatrix::From(*x); - auto eigen_sum_rows = framework::EigenVector::Flatten(sum_rows_tensor); - eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)); - } - - // Normalize row of each distribution to get the probability in range [0, - // 1]. 
- // norm_probs_data: probability of the distribution - framework::Tensor norm_probs_tensor; - auto* norm_probs_data = norm_probs_tensor.mutable_data( - {num_distributions, num_categories}, ctx.GetPlace()); - - // number of threads in a block is min(num_categories, 512) - dim3 block_norm(num_categories < 512 ? num_categories : 512); - dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); - NormalizeProbability< - T><<>>( - norm_probs_data, in_data, sum_rows_data, num_distributions, - num_categories); - - // Get cumulative probability of each distribution. It's the same function - // of - // ``cumsum`` op. - framework::Tensor cumulative_probs_tensor; - auto* cumulative_probs = cumulative_probs_tensor.mutable_data( - {num_distributions, num_categories}, ctx.GetPlace()); - dim3 block_cumsum(1); - dim3 grid_cumsum(num_distributions); - GetCumulativeProbs<<>>( - norm_probs_data, num_distributions, num_categories, cumulative_probs); - - // Generate random number for each sample. - std::random_device rd; - auto seed = rd(); - - framework::Tensor rng_data_tensor; - auto* rng_data = rng_data_tensor.mutable_data( - {num_distributions, num_samples}, ctx.GetPlace()); - - thrust::counting_iterator index_sequence_begin(0); - platform::Transform trans; - auto* context = - static_cast(&ctx.device_context()); - trans(*context, index_sequence_begin, - index_sequence_begin + num_distributions * num_samples, rng_data, - RandomGeneratorCudaFunctor(seed)); - - // Sample the multinomial distributions. - dim3 block_sample(128); - dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); - sampleMultinomialWithReplacement<<>>( - rng_data, num_samples, out_data, num_distributions, num_categories, - cumulative_probs, norm_probs_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - multinomial, ops::MultinomialOpKernel, - ops::MultinomialOpKernel); - -#endif diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index 01135bab6d1d26337eb1bc53867506eae906eea5..ab9f10070fc60deab8974ae0e81e2b4c6cef2ffd 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -12,7 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mv_op.h" +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { @@ -116,10 +122,3 @@ REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, ops::MVOpGradMaker); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); - -REGISTER_OP_CPU_KERNEL( - mv, ops::MVKernel, - ops::MVKernel); -REGISTER_OP_CPU_KERNEL( - mv_grad, ops::MVGradKernel, - ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu deleted file mode 100644 index b8b61ae49047216b94bbaa35a120b551e1aea91b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mv_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace paddle { -namespace operators { - -template -__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout, - const T *vec, T *dx) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < m * n; idx += blockDim.x * gridDim.x) { - int i = idx / n; - int j = idx % n; - dx[idx] = dout[i] * vec[j]; - } -} - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// dX = | dOut Vec^T -// dVec = | X^T dOut -template -class MVGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - auto *dout = - context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dvec = - context.Output(framework::GradVarName("Vec")); - - auto dim_x = x->dims(); - int m = dim_x[0]; - int n = dim_x[1]; - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - const T *dout_data = dout->data(); - - auto &dev_ctx = - context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto stream = context.cuda_device_context().stream(); - auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); - - if (dx) { - T *dx_data = dx->mutable_data(context.GetPlace()); - - MVGradDxCUDAKernel< - T><<>>( - m, n, dout_data, vec_data, dx_data); - } - - if (dvec) { - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - mv, ops::MVKernel, - ops::MVKernel); -REGISTER_OP_CUDA_KERNEL( - mv_grad, ops::MVGradKernel, - ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h deleted file mode 100644 index c0a2172af3677220ff2816bf0f9b7d8ade0d8ba1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mv_op.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
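Editorial note: mv_op.cu and mv_op.h are deleted above because the mv (matrix-vector product) kernels moved to phi. The gradient rule they implemented is dX = dOut * Vec^T and dVec = X^T * dOut; the removed CPU and CUDA grad kernels compute the dX part as an outer product, reproduced in the plain sketch below for reference only.

#include <cstddef>
#include <vector>

void MVGradDx(int m, int n, const std::vector<float>& dout,
              const std::vector<float>& vec, std::vector<float>* dx) {
  dx->assign(static_cast<std::size_t>(m) * n, 0.f);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      // Outer product of the output gradient (length m) with Vec (length n).
      (*dx)[static_cast<std::size_t>(i) * n + j] = dout[i] * vec[j];
    }
  }
}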
*/ - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MVKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - - auto *out = context.Output("Out"); - - auto dim_x = x->dims(); - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - T *out_data = out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(false, dim_x[0], dim_x[1], static_cast(1), x_data, vec_data, - static_cast(0), out_data); - } -}; - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// dX = | dOut vec^T -// dVec = | X^T dOut -template -class MVGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - auto *dout = - context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dvec = - context.Output(framework::GradVarName("Vec")); - - auto dim_x = x->dims(); - int m = dim_x[0]; - int n = dim_x[1]; - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - const T *dout_data = dout->data(); - - if (dx) { - T *dx_data = dx->mutable_data(context.GetPlace()); - - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - dx_data[i * n + j] = dout_data[i] * vec_data[j]; - } - } - } - - if (dvec) { - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 5bfbc3fd681b8a677e5d512750c69706cc68b2d1..3b8ef9056946a1f84d98621442394dbf3e806576 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -96,12 +96,14 @@ struct SparseAdagradFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); + paddle::framework::MixVector mixv_merge_rows(&merge_rows); SparseAdagradFunctorKernel< T, 256><<(context) .stream()>>>( - grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, - param_data, moment_data, grad_width, epsilon); + grad_merge_data, mixv_merge_rows.CUDAMutableData(context.GetPlace()), + lr, param_data, moment_data, grad_width, epsilon); + mixv_merge_rows.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 668dd41fa257f28ab819dd811c1002b024372fab..c1aa392d8a528d248d07fb9654e45e3006e79139 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -345,7 +345,10 @@ class AdamOpCUDAKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; 
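Editorial note: the optimizer hunks in this patch (adagrad, adam, adamw, ftrl, lamb, momentum, rmsprop) all follow the same pattern: instead of asking the rows vector for a device pointer directly, they wrap it in paddle::framework::MixVector<int64_t>, take Data() / CUDAData() / CUDAMutableData() from the wrapper, and call CopyToCPU() when the kernel may have written through the pointer. The toy class below only illustrates that calling pattern; it is not the real MixVector API.

#include <cstdint>
#include <vector>

class ToyMixVector {
 public:
  explicit ToyMixVector(std::vector<int64_t>* host)
      : host_(host), staging_(*host) {}
  // Pointer handed to the kernel (here just a staging copy on the host).
  int64_t* MutableData() { return staging_.data(); }
  const int64_t* Data() const { return staging_.data(); }
  // Propagate kernel-side writes back into the wrapped vector.
  void CopyToCPU() { *host_ = staging_; }

 private:
  std::vector<int64_t>* host_;
  std::vector<int64_t> staging_;  // stands in for the device-side buffer
};

// Usage shaped like the hunks above:
//   std::vector<int64_t> rows = ...;
//   ToyMixVector mixv_rows(&rows);
//   LaunchKernel(mixv_rows.MutableData(), rows.size());
//   mixv_rows.CopyToCPU();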
auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (beta1_pow->place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 7a04b0bd75a4950c926e7db21e13c70ea20d2bb1..decab04f1ca261a828dd749cefbdbaf9f5cfac79 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -592,7 +592,10 @@ class AdamOpKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index abdc61e7fcb46655e3741c1bd7b37a0ec3fd2c7f..1d61bdec26d581278758f39293e600598624435f 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -368,7 +368,10 @@ class AdamWOpCUDAKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (beta1_pow->place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3bb605d7f553ea8a72f0c716d4732ac59e984951..3445e9b658becda84aa678e9c1f03b3436d63b70 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -284,6 +284,16 @@ static void CopyVectorToTensor(const std::vector &src, memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); } +template +static void CopyVectorToCPUTensor(const std::vector &src, + framework::Tensor *dst) { + dst->Resize({static_cast(src.size())}); + T *dst_ptr = dst->mutable_data(platform::CPUPlace()); + const T *src_ptr = src.data(); + auto nbytes = src.size() * sizeof(T); + std::memcpy(dst_ptr, src_ptr, nbytes); +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -677,14 +687,14 @@ class DistributedFusedLambInitOpKernel lengths.back()); } - CopyVectorToTensor( + CopyVectorToCPUTensor(numel_offsets, + ctx.Output("FusedParamOffsets")); + CopyVectorToCPUTensor( fp32_partial_numel_offsets, - ctx.Output("FP32ShardFusedParamOffsets"), place, - stream); - CopyVectorToTensor( + ctx.Output("FP32ShardFusedParamOffsets")); + CopyVectorToCPUTensor( fp16_partial_numel_offsets, - 
ctx.Output("FP16ShardFusedParamOffsets"), place, - stream); + ctx.Output("FP16ShardFusedParamOffsets")); // Fill the weight decay tensor PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 748f8206adbc7fec0b14ca5a72206004951f682c..e5b27446eb330aeb08e134332a5366c6c6ed2908 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -33,12 +33,7 @@ class DistributedFusedLambOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { - if (var_name == "ParamInfo") { - return expected_kernel_type; - } else { - return framework::OperatorWithKernel::GetKernelTypeForVar( - var_name, tensor, expected_kernel_type); - } + return expected_kernel_type; } }; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index aeecea8a8e0c155eaf58f6e846c3d681dbc94c93..3f90140f77282983f42ef03f736c35960239dd75 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -14,8 +14,10 @@ #include #include "paddle/fluid/memory/buffer.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" +#include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" @@ -40,6 +42,163 @@ namespace operators { template using MasterT = typename details::MPTypeTrait::Type; +template +static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { + static_assert(!std::is_same::value, "T cannot be void."); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); +#endif +} + +template +struct L2NormFunctor { + DEVICE void operator()(int tensor_id, int chunk_id, int offset, int size, + const T *x, MasterT *y, int max_chunk_num) const { + using MT = MasterT; + const T *ptr = x + offset; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage storage; + + MT square_sum = static_cast(0); + int i; + for (i = threadIdx.x * VecSize; i + VecSize <= size; + i += (BlockDim * VecSize)) { + platform::AlignedVector tmp_vec; + platform::Load(ptr + i, &tmp_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + auto tmp = static_cast(tmp_vec[j]); + square_sum += (tmp * tmp); + } + } + + for (; i < size; ++i) { + auto tmp = static_cast(ptr[i]); + square_sum += (tmp * tmp); + } + + square_sum = BlockReduce(storage).Reduce(square_sum, cub::Sum()); + if (threadIdx.x == 0) { + y[tensor_id * max_chunk_num + chunk_id] = square_sum; + } + } +}; + +template +static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( + const InT *x, OutT *y, int max_chunk_num) { + int tensor_id = blockIdx.x; + x += (tensor_id * max_chunk_num); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage storage; + InT sum = 
static_cast(0); + for (int i = threadIdx.x; i < max_chunk_num; i += BlockDim) { + sum += x[i]; + } + sum = BlockReduce(storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + if (NeedSqrt) { + y[blockIdx.x] = static_cast(sqrtf(sum)); + } else { + y[blockIdx.x] = static_cast(sum); + } + } +} + +template +static int GetChunkedVecSize(const T *ptr, int chunk_size) { + static_assert(!std::is_same::value, "T cannot be void."); + + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + auto address = reinterpret_cast(ptr); + constexpr int vec8 = alignof(platform::AlignedVector); + constexpr int vec4 = alignof(platform::AlignedVector); + constexpr int vec2 = alignof(platform::AlignedVector); + if (address % vec8 == 0 && chunk_size % vec8 == 0) { + return std::min(8, valid_vec_size); + } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0 && chunk_size % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ + } + +#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ + } \ + } while (0) + +// TODO(zengjinle): which chunk_size is better? +template +static void MultiTensorL2Norm(const platform::CUDAPlace &place, + gpuStream_t stream, const InT *x, + const int *offsets, int n, OutT *y, + int chunk_size = 65536) { + if (n <= 0) return; + + constexpr int kNumTensor = MaxTensorNumPerLaunch; + constexpr int kNumChunk = MaxChunkNumPerLaunch; + constexpr int kBlockDim = BlockDim; + + int max_chunk_num = -1; + int vec_size = 8; + int total_chunk_num = 0; + for (int i = 0; i < n; ++i) { + vec_size = std::min( + vec_size, GetChunkedVecSize(x + offsets[i] - offsets[0], chunk_size)); + int length = offsets[i + 1] - offsets[i]; + auto tmp_chunk_num = (length + chunk_size - 1) / chunk_size; + max_chunk_num = std::max(max_chunk_num, tmp_chunk_num); + total_chunk_num += tmp_chunk_num; + } + + VLOG(1) << "MultiTensorL2Norm max_chunk_num = " << max_chunk_num + << " , total_chunk_num = " << total_chunk_num + << " , tensor_num = " << n; + + using MT = MasterT; + memory::Buffer tmp_out(place); + auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); + FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); + +#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ + max_chunk_num); \ + } while (0) + + PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + + MultiTensorL2NormReduceAgainCUDAKernel<<>>( + tmp_out_ptr, y, max_chunk_num); +} + template static void LogParamAndTrustRatioDivSquareNorm( const framework::ExecutionContext &ctx, const float *param_square_norm, @@ -620,76 +779,6 @@ static void CubDeviceReduce(InputIteratorT d_in, OutputIteratorT d_out, num_items, reduction_op, init, stream)); } -template -static void CubDeviceSegmentedReduce(InputIteratorT d_in, OutputIteratorT d_out, - int num_segments, - 
OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, T initial_value, - gpuStream_t stream, - memory::Buffer *buffer) { - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedReduce::Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, - d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream)); - d_temp_storage = buffer->Alloc(temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedReduce::Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, - d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream)); -} - -template -struct AddConstantFunctor { - explicit AddConstantFunctor(T bias) : bias_(bias) {} - - T operator()(T x) const { return x + bias_; } - - private: - T bias_; -}; - -template -struct OffsetWithBiasFunctor { - OffsetWithBiasFunctor(const T *offset, T bias) - : offset_(offset), bias_(bias) {} - - HOSTDEVICE T operator()(T idx) const { return offset_[idx] - bias_; } - - HOSTDEVICE constexpr bool operator==(const OffsetWithBiasFunctor &) const { - return true; - } - - private: - const T *offset_; - const T bias_; -}; - -template -static void CubDeviceSegmentedSquareNorm(const T *x, MasterT *y, int n, - const OffsetT *offset, - OffsetT init_offset, - gpuStream_t stream, - memory::Buffer *buffer) { - if (n <= 0) return; - cub::TransformInputIterator, SquareFunctor, const T *> iter( - x, SquareFunctor()); - if (init_offset == static_cast(0)) { - CubDeviceSegmentedReduce(iter, y, n, offset, offset + 1, cub::Sum(), - static_cast>(0), stream, buffer); - } else { - cub::CountingInputIterator cnt_iter(0); - OffsetWithBiasFunctor functor(offset, init_offset); - cub::TransformInputIterator, - cub::CountingInputIterator> - offset_iter(cnt_iter, functor); - CubDeviceSegmentedReduce(iter, y, n, offset_iter, offset_iter + 1, - cub::Sum(), static_cast>(0), stream, - buffer); - } -} - template static void GetSquareGradNormImpl(const T *grad, int n, float *square_norm, gpuStream_t stream, @@ -862,16 +951,6 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, } } -template -static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { - static_assert(!std::is_same::value, "T cannot be void."); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); -#endif -} - template class DistributedFusedLambOpKernel : public framework::OpKernel { @@ -1191,13 +1270,16 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets_t->data(); VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), place); + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); VLOG(1) << "FP32ShardFusedParamOffsets: " << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), place); + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); VLOG(1) << "FP16ShardFusedParamOffsets: " << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), place); + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); if (num_devices > 1) { if (use_master_param_norm) { @@ -1207,32 +1289,26 @@ class DistributedFusedLambOpKernel FillZeroWithPtr(trust_ratio_div_square_norm, param_num, stream); } } - CubDeviceSegmentedSquareNorm(fp32_param, 
param_square_norm, - fp32_global_param_num, fused_offsets, 0, - stream, &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, fp32_param, fused_offsets, + fp32_global_param_num, param_square_norm); if (use_master_param_norm) { - CubDeviceSegmentedSquareNorm( - master_param + fp16_offset, param_square_norm + fp16_local_start_idx, - fp16_local_param_num, fp16_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, master_param + fp16_offset, + fp16_partial_fused_offsets, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } else { // NOTE: extra computation is performed. We can improve this performance // if needed in the future. - CubDeviceSegmentedSquareNorm( - fp16_param, param_square_norm + fp32_global_param_num, - fp16_global_param_num, fused_offsets + fp32_global_param_num, - static_cast(fp32_numel), stream, &cub_tmp_buffer); + MultiTensorL2Norm( + place, stream, fp16_param, fused_offsets + fp32_global_param_num, + fp16_global_param_num, param_square_norm + fp32_global_param_num); } - CubDeviceSegmentedSquareNorm( - trust_ratio_div, trust_ratio_div_square_norm + fp32_local_start_idx, - fp32_local_param_num, fp32_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); - CubDeviceSegmentedSquareNorm( - trust_ratio_div + fp32_numel_each_device, - trust_ratio_div_square_norm + fp16_local_start_idx, - fp16_local_param_num, fp16_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, trust_ratio_div, + fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div_square_norm + fp32_local_start_idx); + MultiTensorL2Norm(place, stream, trust_ratio_div + fp32_numel_each_device, + fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div_square_norm + fp16_local_start_idx); VLOG(1) << "TrustRatioDiv L2-Norm before allreduce: " << FlattenToString(trust_ratio_div_square_norm, param_num, place); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index b74009120abc48feb8b4da0256eac96b1e9b1698..596ed05df3ffd740958bc123582139464722ac23 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -189,7 +189,9 @@ class FTRLOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); + auto* merged_rows = merged_grad->mutable_rows(); + paddle::framework::MixVector mixv_merged_rows(merged_rows); + const int64_t* rows = mixv_merged_rows.Data(ctx.GetPlace()); auto row_numel = static_cast(merged_grad->value().dims()[1]); auto row_height = static_cast(merged_grad->rows().size()); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index a2189d2a7ca0eda833e926604affc9d9075b1e75..45acf2b3e48345c6a17c75f8409744776a03b243 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -594,7 +594,10 @@ class LambOpKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if 
(platform::is_gpu_place(ctx.GetPlace()) && beta1_pow.place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 0561c18580a3f6098ef3471d1cfaa328e5b31026..e271755b740ce33369348ca6f415af958a43616d 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -561,7 +561,10 @@ class MomentumOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = merged_grad->mutable_rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size(); platform::ForRange for_range( diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h new file mode 100644 index 0000000000000000000000000000000000000000..5d8d03c733dae210e8a41a8ad78a258df558b341 --- /dev/null +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -0,0 +1,156 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "math.h" // NOLINT + +namespace paddle { +namespace operators { + +template +struct TensorMetaList { + static constexpr int kTensorNum = MaxTensorNumPerLaunch; + static constexpr int kChunkNum = MaxChunkNumPerLaunch; + + static_assert(kTensorNum > 0 && kTensorNum < 256, + "kTensorNum must be inside (0, 256)."); + static_assert(kChunkNum > 0 && kChunkNum < 65536, + "kChunkNum must be inside (0, 65536)."); + + /** + * The tensor numel offset of each tensor. + * The offsets[0] would be always 0 in the first launch, + * and then offsets[0] >= 0 in the following other launches. + * The numel of the i-th tensor would be offsets[i + 1] - offsets[i]. + */ + int offsets[kTensorNum + 1]; + + /** + * The tensor id of each chunk. The tensor_ids[0] is always 0. + * Note that tensor_ids would be always in the ascending order. + * The actual tensor id is start_tensor_id + tensor_ids[i]. + * + * The reason why we assume that the actual tensor id is + * start_tensor_id + tensor_ids[i] is to make tensor_ids to be + * a uint8_t array instead of an int array, making sizeof(TensorMetaList) + * smaller, so that kChunkNum can be larger. + */ + uint8_t tensor_ids[kChunkNum]; + + /** + * The chunk id of the chunk inside each tensor. It would be + * something like chunk_ids = [0, 1, 2, 0, 0, 1, 2, 3], meaning + * that there are 3 tensors and each tensor contains 3, 1 and 4 + * chunks. Note that chunk_ids[0] is always 0 and the actual + * chunk id of the first tensor is always start_chunk_id + chunk_ids[i]. 
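+ *
+ * A hypothetical worked example (assuming a single launch with
+ * chunk_size = 2 and two tensors of numel 5 and 3): the metadata
+ * built by MultiTensorApply below would be
+ *   offsets    = [0, 5, 8]
+ *   tensor_ids = [0, 0, 0, 1, 1]
+ *   chunk_ids  = [0, 1, 2, 0, 1]
+ * so CUDA block b handles chunk chunk_ids[b] of tensor tensor_ids[b]
+ * (both offset by the start_* fields on later launches).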
+ * + * The reason why we assume that the actual chunk id of the first + * tensor is always start_chunk_id + chunk_ids[i] is to make + * chunk_ids to be a uint16_t array instead of an int array, making + * sizeof(TensorMetaList) smaller, so that kChunkNum can be larger. + */ + uint16_t chunk_ids[kChunkNum]; + + /** + * The tensor_ids offset. + */ + int start_tensor_id; + + /** + * The chunk_ids offset. + */ + int start_chunk_id; +}; + +template +static __global__ void MultiTensorApplyCUDAKernel( + Functor functor, + TensorMetaList meta, + int chunk_size, Args... args) { + const int block_id = blockIdx.x; + const int tensor_id = meta.tensor_ids[block_id]; + const int chunk_id = static_cast(meta.chunk_ids[block_id]) + + (tensor_id == 0) * meta.start_chunk_id; + const int prev_offset = meta.offsets[tensor_id]; + const int next_offset = meta.offsets[tensor_id + 1]; + const int ptr_offset = prev_offset + chunk_id * chunk_size; + const int size = min(next_offset - ptr_offset, chunk_size); + + functor(tensor_id + meta.start_tensor_id, chunk_id, ptr_offset, size, + args...); +} + +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + Args... args) { + if (n == 0) return; + + constexpr auto NumTensor = MaxTensorNumPerLaunch; + constexpr auto NumChunk = MaxChunkNumPerLaunch; + TensorMetaList metas; + + int tensor_id = 0; + int chunk_id = 0; + int numel_offset = 0; + metas.start_tensor_id = 0; + metas.start_chunk_id = 0; + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + if (tensor_id == 0) { + metas.start_tensor_id = i; + metas.offsets[0] = numel_offset; + } + metas.offsets[tensor_id + 1] = metas.offsets[tensor_id] + length; + ++tensor_id; + numel_offset += length; + + auto chunk_num = (length + chunk_size - 1) / chunk_size; + int last_launch_chunk_id = 0; + for (int j = 0; j < chunk_num; ++j) { + metas.chunk_ids[chunk_id] = j - last_launch_chunk_id; + metas.tensor_ids[chunk_id] = tensor_id - 1; + ++chunk_id; + + bool tensor_full = (tensor_id == NumTensor && j + 1 == chunk_num); + bool block_full = (chunk_id == NumChunk); + bool last_chunk = (i + 1 == n && j + 1 == chunk_num); + + if (tensor_full || block_full || last_chunk) { + MultiTensorApplyCUDAKernel<<>>( + functor, metas, chunk_size, args...); + chunk_id = 0; + if (j + 1 == chunk_num) { // chunk for the current tensor is full + metas.start_chunk_id = 0; + tensor_id = 0; + } else { + metas.offsets[0] = metas.offsets[tensor_id - 1]; + metas.offsets[1] = metas.offsets[tensor_id]; + metas.start_tensor_id = i; + metas.start_chunk_id = j + 1; + last_launch_chunk_id = j + 1; + tensor_id = 1; + } + } + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 66c16d8015806982a5cf5b321e3ff019fe14831a..71decd27d0d7822c67ba4a2782c1ec2461e67911 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -227,7 +227,10 @@ class RmspropOpKernel : public framework::OpKernel { merge_func(dev_ctx, grad, merged_grad); platform::ForRange for_range(dev_ctx, limit); - const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace()); + auto &grad_merge_rows = merged_grad->rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + &grad_merge_rows); + const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto &merged_tensor = merged_grad->value(); int64_t row_count = 
merged_grad->rows().size(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a255f0fed3ce0c7b143de6d75beabe36b08b6d60..3149f5f56ed4964a750f61a354c6cd31a29fc526 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -148,11 +148,11 @@ class SGDOpKernel int thread_x = kThreadsPerBlock; int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - + paddle::framework::MixVector mixv_in_rows(&in_rows); SparseSGDFunctorKernel<<>>( - in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel, in_rows.size()); + in_data, mixv_in_rows.CUDAData(ctx.GetPlace()), + learning_rate->data(), out_data, in_row_numel, in_rows.size()); } else { PADDLE_ENFORCE_EQ(false, true, diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index cb9bbe727de5c014ebfe9ea93f6fe279a897569b..2a127d9ad1db0c1e169fdd1e20a1568b99d228a0 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pixel_shuffle_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -19,56 +22,6 @@ namespace operators { class PixelShuffleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of PixelShuffleOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of PixelShuffleOp should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or [N, H, W, C], but got %u.", - input_dims.size())); - - auto upscale_factor = ctx->Attrs().Get("upscale_factor"); - - const std::string data_format = - ctx->Attrs().Get("data_format"); - const bool channel_last = (data_format == "NHWC"); - - if (!channel_last) { - PADDLE_ENFORCE_EQ( - input_dims[1] % (upscale_factor * upscale_factor), 0, - platform::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, input_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - input_dims[3] % (upscale_factor * upscale_factor), 0, - platform::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, input_dims[3])); - } - auto output_dims = input_dims; - output_dims[0] = input_dims[0]; - if (!channel_last) { - output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] * upscale_factor; - } else { - output_dims[1] = input_dims[1] * upscale_factor; - output_dims[2] = input_dims[2] * 
upscale_factor; - output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); - } - ctx->SetOutputDim("Out", output_dims); - } }; class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -171,22 +124,16 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PT_INFER_META(phi::PixelShuffleInferMeta)); + REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, - ops::PixelShuffleGradMaker); + ops::PixelShuffleGradMaker, + PixelShuffleInferShapeFunctor); REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); -REGISTER_OP_CPU_KERNEL( - pixel_shuffle, - ops::PixelShuffleOpKernel, - ops::PixelShuffleOpKernel); - -REGISTER_OP_CPU_KERNEL( - pixel_shuffle_grad, - ops::PixelShuffleGradOpKernel, - ops::PixelShuffleGradOpKernel); - REGISTER_OP_VERSION(pixel_shuffle) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu deleted file mode 100644 index 6faf91079e1dac00b3516ccde8dc82cec73a79e6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pixel_shuffle_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pixel_shuffle_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - pixel_shuffle, ops::PixelShuffleOpKernel, - ops::PixelShuffleOpKernel); -REGISTER_OP_CUDA_KERNEL( - pixel_shuffle_grad, - ops::PixelShuffleGradOpKernel, - ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h deleted file mode 100644 index 615bc9772167436aa6aa67e14248a5e853c4350f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pixel_shuffle_op.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PixelShuffleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - int factor = ctx.Attr("upscale_factor"); - - std::string data_format = ctx.Attr("data_format"); - bool channel_last = (data_format == "NHWC"); - - auto in_dims = in->dims(); - auto o_dims = out->dims(); - - framework::Tensor t; - t.ShareDataWith(*in); - if (!channel_last) { - t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); - } else { - t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor}); - } - std::vector axis = {0, 1, 4, 2, 5, 3}; - - framework::Tensor o; - o.ShareDataWith(*out); - if (!channel_last) { - o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); - } else { - o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); - } - phi::funcs::Transpose trans; - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, t, &o, axis); - out->Resize(o_dims); - } -}; - -template -class PixelShuffleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int factor = ctx.Attr("upscale_factor"); - - std::string data_format = ctx.Attr("data_format"); - bool channel_last = (data_format == "NHWC"); - - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); - - framework::Tensor t; - t.ShareDataWith(*dout); - if (!channel_last) { - t.Resize( - {do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); - } else { - t.Resize( - {do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]}); - } - std::vector axis = {0, 1, 3, 5, 2, 4}; - - framework::Tensor o; - o.ShareDataWith(*dx); - if (!channel_last) { - o.Resize( - {do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); - } else { - o.Resize( - {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); - } - phi::funcs::Transpose trans; - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, t, &o, axis); - dx->Resize(dx_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index cc4b6e5e0756a0a50dd3f28d6c7056e748c80a87..0cecbf0b9cb027f7032b7b20fb10ef06a79503df 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include - -#include "paddle/fluid/operators/poisson_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,14 +25,6 @@ class PoissonOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PoissonOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PoissonOp"); - - auto dim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -61,29 +55,6 @@ class PoissonOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { } }; -template -class PoissonKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); - - const T *x_data = x->data(); - T *out_data = out->mutable_data(ctx.GetPlace()); - - int64_t size = x->numel(); - - auto gen = framework::DefaultCPUGenerator(); - auto engine = gen->GetCPUEngine(); - - for (int64_t i = 0; i < size; ++i) { - std::poisson_distribution<> dist(x_data[i]); - out_data[i] = static_cast(dist(*engine)); - } - } -}; - class PoissonGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -116,17 +87,13 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, ops::PoissonGradOpMaker, - ops::PoissonGradOpMaker); + ops::PoissonGradOpMaker, + PoissonInferShapeFunctor); REGISTER_OPERATOR(poisson_grad, ops::PoissonGradOp); - -REGISTER_OP_CPU_KERNEL(poisson, - ops::PoissonKernel, - ops::PoissonKernel); - -REGISTER_OP_CPU_KERNEL(poisson_grad, - ops::PoissonGradKernel, - ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.cu b/paddle/fluid/operators/poisson_op.cu deleted file mode 100644 index ef2f6d4665554024066f4e843707d6612290340f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/poisson_op.cu +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -#endif -#include "paddle/fluid/operators/poisson_op.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct PoissonCudaFunctor { - public: - PoissonCudaFunctor(const T* in, T* out, unsigned int seed, - unsigned int offset) - : in_(in), out_(out), seed_(seed), offset_(offset) {} - - __device__ void operator()(int64_t idx) { -#ifdef __NVCC__ - curandStatePhilox4_32_10_t state; - curand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(curand_poisson(&state, in_[idx])); -#elif __HIPCC__ - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); -#endif - } - - private: - const T* in_; - T* out_; - const unsigned int seed_; - const unsigned int offset_; -}; - -template -class PoissonKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - auto size = x->numel(); - int64_t device_id = ctx.GetPlace().GetDeviceId(); - - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - auto seed_offset = gen_cuda->IncrementOffset(20); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, size); - - PoissonCudaFunctor functor(x_data, out_data, seed, offset); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(poisson, - ops::PoissonKernel, - ops::PoissonKernel); - -REGISTER_OP_CUDA_KERNEL( - poisson_grad, ops::PoissonGradKernel, - ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.h b/paddle/fluid/operators/poisson_op.h deleted file mode 100644 index 2bcb5244012c7663c413fceaa63a9dbbd78147b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/poisson_op.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PoissonKernel; - -template -class PoissonGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant functor; - auto& dev_ctx = ctx.template device_context(); - functor(dev_ctx, dx, static_cast(0)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 980351e12a030760b6793ab665d80db737bfa9d5..c5971632b03ef3811d0e836a306f26f7e9a51eb8 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc index 09c58cd7d4cda396d60a94b02cc8a705bb3c3b01..548e28716dd9108ffd55463cccf9f91ad3b9a941 100644 --- a/paddle/fluid/operators/randint_op.cc +++ b/paddle/fluid/operators/randint_op.cc @@ -24,37 +24,6 @@ namespace paddle { namespace operators { -template -class CPURandintKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector new_shape; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { - if (ctx.HasInput("ShapeTensor")) { - auto* shape_tensor = ctx.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - auto* out = ctx.Output("Out"); - if (!new_shape.empty()) out->Resize(phi::make_ddim(new_shape)); - T* data = out->mutable_data(ctx.GetPlace()); - int64_t size = out->numel(); - - std::uniform_int_distribution dist(ctx.Attr("low"), - ctx.Attr("high") - 1); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; - class RandintOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -176,6 +145,3 @@ REGISTER_OPERATOR( randint, ops::RandintOp, ops::RandintOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) - -REGISTER_OP_CPU_KERNEL(randint, ops::CPURandintKernel, - ops::CPURandintKernel) diff --git a/paddle/fluid/operators/randint_op.cu b/paddle/fluid/operators/randint_op.cu deleted file mode 100644 index 2f9a8cfd142ec7a3d0175b91bd79f239f654c126..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/randint_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include -#include -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/uniform_random_op.h" - -namespace paddle { -namespace operators { - -template -class GPURandintKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector new_shape; - auto list_new_shape_tensor = - context.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || context.HasInput("ShapeTensor")) { - if (context.HasInput("ShapeTensor")) { - auto* shape_tensor = context.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - - platform::CPUPlace cpu; - auto dtype = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - if (!new_shape.empty()) out->Resize(phi::make_ddim(new_shape)); - T low = static_cast(context.Attr("low")); - T high = static_cast(context.Attr("high")) - 1; - framework::LoDTensor tensor; - tensor.Resize(out->dims()); - tensor.mutable_data(cpu, framework::TransToPtenDataType(dtype)); - T* data = tensor.mutable_data(cpu); - - int64_t size = out->numel(); - unsigned int seed = static_cast(context.Attr("seed")); - - /* - std::minstd_rand engine; - if (seed == 0) { - std::random_device rd; - seed = rd(); - } - engine.seed(seed); - */ - - std::uniform_int_distribution<> dist(context.Attr("low"), - context.Attr("high") - 1); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - - if (platform::is_gpu_place(context.GetPlace())) { - // Copy tensor to out - framework::TensorCopy(tensor, context.GetPlace(), out); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(randint, ops::GPURandintKernel, - ops::GPURandintKernel) diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index bdc2ea0b5bfbbfc45f02d4df3a7cf1dbae25bacf..1b28ab3c133f7d57250e3357b0d732603719ef99 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/randperm_op.h" #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -89,10 +88,3 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, paddle::operators::RandpermOpVarTypeInference); - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_CPU_KERNEL(randperm, kernel, kernel, kernel, - kernel); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 6393ff2135d1dcae37b2b9e60775460668bf295a..21c23a7f602a35acf676e97a9134c2c43a73126c 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -115,7 +116,9 @@ void BufferedReader::ReadAsync(size_t i) { platform::CUDAPinnedPlace cuda_pinned_place; std::vector cuda_pinned_ptrs; cuda_pinned_ptrs.reserve(cpu.size()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event( + "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, + 1); // NODE(chenweihang): When we use CUDAPinned Memory, we need call // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. @@ -170,7 +173,9 @@ void BufferedReader::ReadAsync(size_t i) { cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #endif - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event( + "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, + 1); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); @@ -229,7 +234,9 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUEventRecord(events_[i].get(), compute_stream_); platform::NPUStreamWaitEvent(stream_.get(), events_[i].get()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 73bc67287c2780d541d93df620776f2936c6ec86..d406640bff240cc24400e858d5c8b274897e1f98 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -106,7 +106,8 @@ class ReadOp : public framework::OperatorBase { std::vector ins; // For profiling - platform::RecordEvent record_event(Type()); + platform::RecordEvent record_event( + Type().c_str(), platform::TracerEventType::UserDefined, 1); reader->ReadNext(&ins); if (ins.empty()) { diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1174e72a76b1bb5aa744b964e289f0ac9c66596c..1f3691978b577e2023eb4f784f2327752855b9b7 100644 --- 
a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/real_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class RealOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Real"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Real"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class RealOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,19 +82,13 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(real, ops::RealOp, ops::RealOpMaker, ops::RealGradOpMaker<::paddle::framework::OpDesc>, - ops::RealGradOpMaker<::paddle::imperative::OpBase>); + ops::RealGradOpMaker<::paddle::imperative::OpBase>, + RealInferShapeFunctor); REGISTER_OPERATOR(real_grad, ops::RealGradOp); - -REGISTER_OP_CPU_KERNEL(real, ops::RealKernel>, - ops::RealKernel>); -REGISTER_OP_CPU_KERNEL(real_grad, - ops::RealGradKernel>, - ops::RealGradKernel>); diff --git a/paddle/fluid/operators/real_op.cu b/paddle/fluid/operators/real_op.cu deleted file mode 100644 index 9bfb2878a6261bb5c69a1fb543e5aa15a87c5a8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/real_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/real_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(real, - ops::RealKernel>, - ops::RealKernel>); -REGISTER_OP_CUDA_KERNEL(real_grad, - ops::RealGradKernel>, - ops::RealGradKernel>); diff --git a/paddle/fluid/operators/real_op.h b/paddle/fluid/operators/real_op.h deleted file mode 100644 index c5a9724e8a3048a27aaadfc5e0c42be4816004bd..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/real_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -class RealKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* out = ctx.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class RealGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e02f0268b5e510ac8262543db58ee98ef20e517 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
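+
+// Summary of the kernel below: reduces "X" over the "dim" attribute
+// (or over all dimensions when reduce_all is true) with the CNNL
+// max-reduce primitive; when out_dtype is set, the output is produced
+// in that dtype instead of the input's dtype.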
+ +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + int out_dtype = context.Attr("out_dtype"); + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + auto place = context.GetPlace(); + framework::Tensor cast_out(input->type()); + cast_out.Resize(output->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = framework::TransToProtoVarType(input->dtype()); + + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + output->mutable_data(place); + } + } else { + output->ShareDataWith(cast_out); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_MAX, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_max, ops::ReduceMaxMLUKernel, + ops::ReduceMaxMLUKernel, + ops::ReduceMaxMLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..daf5965fd54628a097ad1d53057ec54b9a5d329a --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + int out_dtype = context.Attr("out_dtype"); + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + auto place = context.GetPlace(); + framework::Tensor cast_out(input->type()); + cast_out.Resize(output->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = framework::TransToProtoVarType(input->dtype()); + + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + output->mutable_data(place); + } + } else { + output->ShareDataWith(cast_out); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_MIN, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_min, ops::ReduceMinMLUKernel, + ops::ReduceMinMLUKernel, + ops::ReduceMinMLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ca3575f5dea84321e4fb46cbaa5606652ef267d4..65cca94814e88111239aef3559285d6fe321a72d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -257,12 +257,12 @@ class ReduceKernel : public framework::OpKernel { std::vector tmp_dims(dims.begin(), dims.end()); // call new kernel - phi::Reduce::TYPE, - T, Functor>( - static_cast::TYPE, T, + Functor>( + static_cast::TYPE&>(dev_ctx), *input, reduce_all, tmp_dims, keep_dim, - framework::TransToPtenDataType(cast_out_dtype), output); + framework::TransToPhiDataType(cast_out_dtype), output); } }; template @@ -541,11 +541,12 @@ class ReduceOp : public framework::OperatorWithKernel { #endif if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU place")); + 
PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()) || + platform::is_npu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()), + true, platform::errors::InvalidArgument( + "float16 can only be used on GPU or NPU or MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -683,7 +684,7 @@ class ReduceCudaKernel : public framework::OpKernel { const Tensor* input = context.Input("X"); Tensor* output = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); - auto pt_out_dtype = paddle::framework::TransToPtenDataType( + auto pt_out_dtype = paddle::framework::TransToPhiDataType( static_cast(out_dtype)); std::vector dims = context.Attr>("dim"); @@ -713,7 +714,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); auto* d_x = context.Output(framework::GradVarName("X")); auto out_dtype = context.Attr("in_dtype"); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); @@ -734,8 +735,8 @@ class ReduceCudaGradKernel : public framework::OpKernel { } else { d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); } - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_d_out = paddle::experimental::MakePhiDenseTensor(new_d_out); + auto pt_d_x = paddle::experimental::MakePhiDenseTensor(*d_x); if (out_dtype <= 0) { pt_out_dtype = d_out->dtype(); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ddb598f575f6737f7c7d4336eeee866b12c12fb1..8d99a60b12967a55e0cc208c6ae96c0dabb5f473 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" // only can include the headers in paddle/phi/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -476,6 +476,21 @@ class Reshape2Op : public ReshapeOp { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : ReshapeOp(type, inputs, outputs, attrs) {} + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, + platform::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); + } }; class Reshape2OpMaker : public ReshapeOpMaker { @@ -636,13 +651,10 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); -DELCARE_INFER_SHAPE_FUNCTOR(reshape2, ReshapeInferShapeFunctor, - PT_INFER_META(phi::ReshapeWithXShapeInferMeta)); - REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker, ops::Reshape2GradMaker, - ReshapeInferShapeFunctor, ops::ReshapeOpInplaceInferer); + ops::ReshapeOpInplaceInferer); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, ops::Reshape2DoubleGradMaker, ops::Reshape2DoubleGradMaker, diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 7764e52c2f6da1b401b01292969c4d3d04555933..09d2d906653e8c71ddeca7fa606cf5adac8cc596 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -32,6 +32,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); auto in_dims = in->dims(); int batch_size = in_dims[0]; @@ -117,7 +118,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { dev_ctx.x_context(), in->data(), out->mutable_data(ctx.GetPlace()), rois->data(), roi_id_data, batch_size, channels, height, width, out->dims()[0], pooled_height, - pooled_width, spatial_scale, sampling_ratio, true); + pooled_width, spatial_scale, sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The roi_align XPU OP return wrong value[%d %s]", r, @@ -143,6 +144,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); int rois_num = rois->dims()[0]; int channels = in->dims()[1]; @@ -197,7 +199,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { dev_ctx.x_context(), out_grad->data(), in_grad->data(), rois->data(), roi_id_data, in->dims()[0], channels, height, width, out_grad->dims()[0], pooled_height, pooled_width, spatial_scale, - sampling_ratio, true); + sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( diff --git a/paddle/fluid/operators/row_conv_op.cu 
b/paddle/fluid/operators/row_conv_op.cu index 3def7875232e814b817a7957ab9db65ea611dcf6..c5794948aaec6b47396cbae66a962058812aba11 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -336,7 +336,8 @@ class RowConvKernel int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mix_vector(&batch_indices); + size_t *idx = mix_vector.CUDAMutableData(context.GetPlace()); auto stream = context.cuda_device_context().stream(); if (future_context <= 32) { @@ -352,6 +353,7 @@ class RowConvKernel RowConvForward<<>>( in, weight, num_sequence, input_dim, future_context, idx, out); } + mix_vector.CopyToCPU(); } }; @@ -392,7 +394,8 @@ class RowConvGradKernel // int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mixv_batch_indices(&batch_indices); + size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); phi::funcs::SetConstant zero; @@ -444,6 +447,7 @@ class RowConvGradKernel dout, weights, num_sequence, input_dim, future_context, idx, din); } } + mixv_batch_indices.CopyToCPU(); } }; } // namespace operators diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index d6e8f3e5aa1086900d0144ea8757a05776b9c9b0..40f5699a29b355864652b5d899d1918ec663cf0b 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -42,9 +42,9 @@ class ScaleXPUKernel : public framework::OpKernel { framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); auto& dev_ctx = ctx.template device_context(); - // call pten kernel + // call phi kernel phi::ScaleKernel( - static_cast::TYPE&>(dev_ctx), *in, scale, bias, bias_after_scale, out); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 8092a40d19b195828c3742854e9b3656424feee7..9591f3e8b5bbfe70cb059b621eaca0ae1fff993e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -71,7 +71,8 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { out->Resize({in_dims[0], win_size}); auto out_data = out->mutable_data(context.GetPlace()); // Copy LoD to GPU - const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace()); + paddle::framework::MixVector mixv_lod0(&lod0); + const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace()); // Calc output tensor CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bb928cf401c3307b76160387e5108264cd5dbb89..12d3eee65da70edd3f360d448360bb59d2f1069f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -88,7 +88,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { // Copy LoD to GPU auto last_lod = lod[lod.size() - 1]; auto lod_len = last_lod.size(); - const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace()); + paddle::framework::MixVector mixv_last_lod(&last_lod); + 
const size_t* dev_in_lod_ptr = mixv_last_lod.CUDAData(ctx.GetPlace()); // Calc output LoD thrust::device_vector dev_out_lod(lod_len); size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index f13849fda41769af12aabf93be748e3ce2ad806b..7e1a06b9eca5b9046d2b772edee0efdb1a69437f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -81,8 +81,9 @@ struct SequenceExpandAsFunctor { dim3 block_size(thread_x); dim3 grid_size(block_x); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_expand_as_kernel<<>>( - x.data(), ref_lod.CUDAData(context.GetPlace()), height, width, + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, width, out->mutable_data(context.GetPlace())); } }; @@ -107,10 +108,11 @@ struct SequenceExpandAsGradFunctor { dim3 block_size(thread_x); dim3 grid_size(block_x); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_expand_as_grad_kernel<<>>( - dout.data(), ref_lod.CUDAData(context.GetPlace()), height, width, - dx->mutable_data(context.GetPlace())); + dout.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, + width, dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index cbf5df001707592e03b315b357e3a5d484068011..7b7bc5183bf1f6c98ef386150fcfa4d048e73f01 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -157,7 +157,9 @@ struct SequenceExpandFunctor { out_offset[2 * x_lod_size + i] = ref_lod[i]; } - const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace()); + paddle::framework::MixVector mixv_out_offset(&out_offset); + const size_t* out_offset_data = + mixv_out_offset.CUDAData(context.GetPlace()); const size_t* x_lod_data = out_offset_data + x_lod_size; const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size; @@ -193,11 +195,14 @@ struct SequenceExpandGradFunctor { int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); + paddle::framework::MixVector mixv_x_lod(&x_lod); + paddle::framework::MixVector mixv_out_offset(&out_offset); sequence_expand_grad_kernel<<>>( - dout.data(), ref_lod.CUDAData(context.GetPlace()), - x_lod.CUDAData(context.GetPlace()), - out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, - dx->mutable_data(context.GetPlace())); + dout.data(), mixv_ref_lod.CUDAData(context.GetPlace()), + mixv_x_lod.CUDAData(context.GetPlace()), + mixv_out_offset.CUDAData(context.GetPlace()), ref_lod.size(), + x_item_length, dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index c42df836de15f5c51caf32e5d0b7b7d8123ff201..90a17d713cf299a3a61169cfc6f16fce7bb5901c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -132,7 +132,9 @@ class SequenceReverseOpKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - lod = x.lod()[0].CUDAData(ctx.GetPlace()); 
+ auto xlod = x.lod()[0]; + paddle::framework::MixVector mixv_xlod(&xlod); + lod = mixv_xlod.CUDAData(ctx.GetPlace()); } else { #endif lod = x.lod()[0].data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 220165ac1bd4f6a80a2f3c0b21f5423352982588..c91c59dbfee9993711e777668063bec73a3746d8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -133,9 +133,10 @@ struct SequenceSoftmaxFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_softmax_kernel< T, kThreadsPerBlock><<>>( - x.data(), ref_lod.CUDAData(context.GetPlace()), height, + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, out->mutable_data(context.GetPlace())); } }; @@ -156,10 +157,12 @@ struct SequenceSoftmaxGradFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_softmax_grad_kernel< T, kThreadsPerBlock><<>>( - dout.data(), out.data(), ref_lod.CUDAData(context.GetPlace()), - height, dx->mutable_data(context.GetPlace())); + dout.data(), out.data(), + mixv_ref_lod.CUDAData(context.GetPlace()), height, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index 70733d643673ad8acde9a45f273a52a9723fb0d3..e584c1a4cce1e85344c574526098b034723c3059 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/size_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,13 +23,6 @@ namespace operators { class SizeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Size"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Size"); - - ctx->SetOutputDim("Out", {1}); - } }; class SizeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -49,11 +44,10 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PT_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, - ops::SizeKernel, - ops::SizeKernel, ops::SizeKernel, - ops::SizeKernel); + paddle::framework::EmptyGradOpMaker, + SizeInferShapeFunctor); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h deleted file mode 100644 index 8840fde287d662043d032ec83bc7b7e42ead417d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/size_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class SizeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - auto place = ctx.GetPlace(); - auto out_data = out_t->mutable_data(place); - auto cpu_place = platform::CPUPlace(); - if (place == cpu_place) { - out_data[0] = in_t->numel(); - } else { - Tensor cpu_tensor; - auto cpu_data = - cpu_tensor.mutable_data(out_t->dims(), cpu_place); - cpu_data[0] = in_t->numel(); - paddle::framework::TensorCopy(cpu_tensor, place, out_t); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc index 5826d2b4a8742b5572e67237139ff8654b2c9e67..95b97025f2969590000e3d336556c0b02ed037de 100644 --- a/paddle/fluid/operators/size_op_npu.cc +++ b/paddle/fluid/operators/size_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu b/paddle/fluid/operators/softmax_cudnn_op.cu deleted file mode 100644 index 72c2e97c1782ed2a817241a6d17f5f6f52add4ae..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_cudnn_op.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxCUDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.template device_context(); - SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); - } -}; - -template -class SoftmaxGradCUDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.template device_context(); - SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, input_axis, dx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); -#else -REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); -#endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cb97a0bb27cb5c459bf7f2ccd53374759643133f..374992096605bfef0433992193e54306c3a12858 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/softmax_op.h" - #include #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN @@ -251,10 +250,3 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxInplaceInferer); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL( - softmax, ops::SoftmaxKernel, - ops::SoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - softmax_grad, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc deleted file mode 100644 index 19359b7eef5126d84f0707d39095a74ae4561186..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_op.cu.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - softmax, ops::SoftmaxKernel, - ops::SoftmaxKernel, - ops::SoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - softmax_grad, ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h deleted file mode 100644 index 497bbb06dab5f174909684feb0c3bb4546ab3d0e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeOutAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -class SoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = X->dims()[axis]; - - // allocate memory on device. 
- Out->mutable_data(context.GetPlace()); - if (Out->numel() == 0) { - return; - } - - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - Tensor X_2d, Out_2d; - X_2d.ShareDataWith(*X).Resize({n, d}); - Out_2d.ShareDataWith(*Out).Resize({n, d}); - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_2d, - &Out_2d); - } -}; - -template -class SoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = dX->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = dX->dims()[axis]; - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - if (dX->numel() == 0) { - return; - } - - const int n = SizeToAxis(axis, dX->dims()); - const int d = SizeFromAxis(axis, dX->dims()); - Tensor dX_2d, Out_2d, dOut_2d; - dX_2d.ShareDataWith(*dX).Resize({n, d}); - Out_2d.ShareDataWith(*Out).Resize({n, d}); - dOut_2d.ShareDataWith(*dOut).Resize({n, d}); - - math::SoftmaxGradFunctor()( - context.template device_context(), axis_dim, &Out_2d, - &dOut_2d, &dX_2d); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc index 07e74354bfd7ce930dc0a1b084e668fa2a0983cf..152c8d0a883b09358dc253d65523b30fb59a25b6 100644 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -12,8 +12,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -51,7 +52,7 @@ class SoftmaxGradNPUKernel : public framework::OpKernel { auto dims = dX->dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); int64_t first_dim = 1; int64_t sec_dim = 1; for (int i = 0; i < axis; i++) { diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index defda1a3b04a62254cc6ccbfe254f739cc31f909..3bc55fafd81e18d0a986268ff4692129c6515edc 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, NPU); template diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index a29804e505f66f8ee4bf4eb281886b45963f537c..1ed13c8bd1baea28301814d788af67954ee7932a 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -11,8 +11,8 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -29,7 +29,7 @@ class SoftmaxXPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Output("Out"); const int rank = x->dims().size(); - int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); // allocate memory on device. out->mutable_data(context.GetPlace()); @@ -88,7 +88,7 @@ class SoftmaxGradXPUKernel : public framework::OpKernel { auto* dout = context.Input(framework::GradVarName("Out")); auto* dx = context.Output(framework::GradVarName("X")); const int rank = dx->dims().size(); - int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); // allocate memory on device. dx->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index cba779d0a77d0037c596de1de3b486bf567c05f2..6f0881e9fc98f6c1ce6c7535c9c68a2fe64e2241 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -153,7 +153,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(Logits).")); - axis = CanonicalAxis(axis, logits_rank); + axis = phi::funcs::CanonicalAxis(axis, logits_rank); for (int i = 0; i < logits_rank; i++) { if (i != axis) { if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { @@ -250,7 +250,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(Logits).")); - axis = CanonicalAxis(axis, softmax_rank); + axis = phi::funcs::CanonicalAxis(axis, softmax_rank); for (int i = 0; i < softmax_rank; i++) { if (i != axis) { if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 2bbacef596e5916a7c82c29a83f85e9b5932b2d4..19a395e72314db52d52cf704a567dce8dd58318a 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -17,19 +17,22 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { +#define ALIGN_BYTES 16 + using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; +namespace kps = phi::kps; // Wrapper of log function. 
Use log(float32) for float16 template @@ -47,6 +50,18 @@ static __device__ __forceinline__ T Exp(T x) { return math::TolerableValue()(static_cast(expx)); } +template +struct ExpAddFunctor { + HOSTDEVICE inline ExpAddFunctor(Tx max) : max(max) {} + + HOSTDEVICE inline Ty operator()(const Tx& sum, const Tx& x) const { + return static_cast(sum + std::exp(x - max)); + } + + private: + Tx max; +}; + // log2(value) static inline int Log2Ceil(int value) { int log2_value = 0; @@ -236,7 +251,7 @@ __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax; } } - WarpReduceMax(max_value); + phi::WarpReduceMax(max_value); // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } AccT sum[kBatchSize]; @@ -276,7 +291,7 @@ __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); // write data #pragma unroll @@ -419,10 +434,272 @@ void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, } } +template +__device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, + const int label_id, + const int64_t label_value, + const int tid, const int vec_size, + const int offset, + const int ignore_index) { + int loss_id = vec_size * tid + offset; + if (IgnoreIndex) { + if (label_value == loss_id) { + if (label_value == ignore_index) { + loss[label_id] = static_cast(0.0f); + } else { + loss[label_id] = loss_value; + } + } + } else { + if (label_value == loss_id) { + loss[label_id] = loss_value; + } + } +} + +template +__device__ __forceinline__ AccT ThreadReduce(const T* input, int size, + const int offset, AccT init, + ReduceFunctor reducer) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + AccT val = init; + + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + val = reducer(val, input[tid]); + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + val = reducer(val, ins[i]); + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + val = reducer(val, input[tid]); + } + return val; +} + +template +__device__ __forceinline__ void VectorizedSoftmaxForwardImpl( + T* loss, T* softmax, const T* logits, const LabelT* label, int size, + const int offset, const phi::LogSoftmaxForwardFunctor& func, + const int ignore_index) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + int label_id = blockIdx.x; + auto label_value = static_cast(label[label_id]); + const bool label_valid = label_value >= 0 && label_value < size; + int loss_id_offset = 0; + + if (offset > 0) { + logits -= offset; + softmax -= offset; + size += offset; + loss_id_offset -= offset; + if (tid >= offset) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, 1, + loss_id_offset, ignore_index); + } + } + size -= blockDim.x; + logits += blockDim.x; + softmax += blockDim.x; + loss_id_offset += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + T outs[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + VecT* 
outs_vec = reinterpret_cast(&outs); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + // read + *ins_vec = reinterpret_cast(logits)[tid]; + +#pragma unroll + // compute + for (int i = 0; i < VecSize; ++i) { + AccT log_softmax = func(static_cast(ins[i])); + outs[i] = static_cast(std::exp(log_softmax)); + + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, VecSize, + loss_id_offset + i, ignore_index); + } + } + + // write + reinterpret_cast(softmax)[tid] = *outs_vec; + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), label_id, + label_value, tid, 1, loss_id_offset, + ignore_index); + } + } + + // invalid label, write once + if (!label_valid && threadIdx.x == 0) { + loss[label_id] = static_cast(0.0f); + } +} + +template +__device__ __forceinline__ void ScalarSoftmaxForwardImpl( + T* loss, T* softmax, const T* logits, const LabelT* label, const int size, + const phi::LogSoftmaxForwardFunctor& func, const int ignore_index) { + int tid = threadIdx.x; + int remain = size % (VecSize * blockDim.x); + int label_id = blockIdx.x; + auto label_value = static_cast(label[label_id]); + const bool label_valid = label_value >= 0 && label_value < size; + + // main part + for (; tid < (size - remain); tid += VecSize * blockDim.x) { + T ins[VecSize]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + ins[i] = logits[tid + i * blockDim.x]; + } +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT log_softmax = func(static_cast(ins[i])); + softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, VecSize, i, + ignore_index); + } + } + } + + // tail part + for (; tid < size; tid += blockDim.x) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), label_id, + label_value, tid, 1, 0, ignore_index); + } + } + + // invalid label, write once + if (!label_valid && threadIdx.x == 0) { + loss[label_id] = static_cast(0.0f); + } +} + +template +__global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, + const LabelT* label, + const int high_dim, const int mid_dim, + const int ignore_index) { + using VecT = kps::details::VectorType; + + // each block deal with one batch + logits += blockIdx.x * mid_dim; + softmax += blockIdx.x * mid_dim; + + const int input_offset = ((uint64_t)logits) % ALIGN_BYTES / sizeof(T); + const int output_offset = ((uint64_t)softmax) % ALIGN_BYTES / sizeof(T); + + // 1. reduce max + AccT max = ThreadReduce>( + logits, mid_dim, input_offset, -std::numeric_limits::infinity(), + kps::MaxFunctor()); + max = kps::details::BlockXReduce>( + max, kps::MaxFunctor()); + + // 2. reduce sum + AccT sum = ThreadReduce>( + logits, mid_dim, input_offset, static_cast(0), + ExpAddFunctor(max)); + sum = kps::details::BlockXReduce>( + sum, kps::AddFunctor()); + + // 3. 
softmax + phi::LogSoftmaxForwardFunctor func(max, sum); + if (input_offset == output_offset) { + VectorizedSoftmaxForwardImpl( + loss, softmax, logits, label, mid_dim, input_offset, func, + ignore_index); + } else { + ScalarSoftmaxForwardImpl( + loss, softmax, logits, label, mid_dim, func, ignore_index); + } +} + +template +void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, + const LabelT* label, const int high_dim, + const int mid_dim, const int ignore_index, + gpuStream_t stream) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(mid_dim / vec_size, max_num_threads); + if (vec_size > 1) { + max_block_size /= 2; + } + + int block_size = 1; + while (block_size < max_block_size) { + block_size *= 2; + } + block_size = std::max(block_size, kps::details::kWarpSize); + dim3 grids(high_dim); + dim3 blocks(block_size); + VectorizedSoftmaxForward<<>>( + loss, softmax, logits, label, high_dim, mid_dim, ignore_index); +} + /* Wrapper of softmax with cross entropy hard label. - - SwitchWarpSoftmaxForward for small size - - cudnn function for large size + - SwitchWarpSoftmaxForward for small size when axis == -1 + - LaunchVectorizedSoftmaxForward for large size when axis == -1 + - cudnn function for axis != -1 */ template static void SoftmaxWithCrossEntropyHardLabel( @@ -431,11 +708,17 @@ static void SoftmaxWithCrossEntropyHardLabel( T* softmax_data, int N, int dim, int D, const int ignore_index) { auto stream = ctx.stream(); constexpr int max_dim = 320; - if (D == 1 && dim <= max_dim) { // small size - const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, dim, - ignore_index, stream); + if (D == 1) { + if (dim <= max_dim) { // small size + const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; + SwitchWarpSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, dim, + ignore_index, stream); + } else { // large size + LaunchVectorizedSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, + ignore_index, stream); + } } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; @@ -566,7 +849,7 @@ __global__ void CrossEntropySoftLabel(T* loss, T* softmaxwrt, const T* softmax, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); __syncthreads(); __shared__ T sumshare[kWarpPerBatch][kBatchPerBlock][kBatchSize]; @@ -674,7 +957,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, : static_cast(valmax); } } - WarpReduceMax(max_value); + phi::WarpReduceMax(max_value); // compute sum AccT sum[kBatchSize]{0.0}; @@ -694,7 +977,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); // log_softmax and loss AccT sumloss[kBatchSize]{0.0}; @@ -737,7 +1020,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, } // loss - WarpReduceSum(sumloss); + phi::WarpReduceSum(sumloss); for (int i = 0; i < kBatchSize; i++) { if (i >= local_batches) break; @@ -950,11 +1233,12 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = softmax->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = + phi::funcs::CanonicalAxis(context.Attr("axis"), rank); const int axis_dim = 
softmax->dims()[axis]; - const int n = SizeToAxis(axis, softmax->dims()); - const int d = SizeFromAxis(axis, softmax->dims()); + const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); auto* softmax_out_data = softmax_out->template mutable_data(context.GetPlace()); @@ -1035,11 +1319,11 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logits->dims()[axis]; - const int64_t n = SizeToAxis(axis, logits->dims()); - const int64_t d = SizeFromAxis(axis, logits->dims()); + const int64_t n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis, logits->dims()); auto* softmax_data = softmax->template mutable_data(context.GetPlace()); auto* loss_data = loss->template mutable_data(context.GetPlace()); @@ -1118,11 +1402,11 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logit_grad->dims()[axis]; - const int64_t n = SizeToAxis(axis, logit_grad->dims()); - const int64_t d = SizeFromAxis(axis, logit_grad->dims()); + const int64_t n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); const int64_t remain = d / axis_dim; #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index a7f88dd0ec38e55a7f1d0ea79436cdd376d14393..4b875cbf5841f661b55e668808051c8928b45cdd 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -84,7 +84,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { Tensor* softmax_out = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); const int rank = softmax->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = + phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = softmax->dims()[axis]; PADDLE_ENFORCE_GT( @@ -97,7 +98,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax_out->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, softmax->dims()); + const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( @@ -105,7 +106,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { "SizeToAxis of softmax is %d.", n)); - const int d = SizeFromAxis(axis, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); Tensor softmax_2d, labels_2d, loss_2d, softmax_out_2d; softmax_2d.ShareDataWith(*softmax).Resize({n, d}); @@ -133,7 +134,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logits->dims()[axis]; PADDLE_ENFORCE_GT( axis_dim, 0, @@ -145,14 +146,14 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, logits->dims()); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( "The size of axis should be larger than 0, but received " "SizeToAxis of logits is %d.", n)); - const int d = SizeFromAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); Tensor logits_2d, softmax_2d, labels_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({n, d}); softmax_2d.ShareDataWith(*softmax).Resize({n, d}); @@ -192,7 +193,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logit_grad->dims()[axis]; PADDLE_ENFORCE_GT( axis_dim, 0, @@ -201,14 +202,14 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { "axis dimention is %d.", axis_dim)); - const int n = SizeToAxis(axis, logit_grad->dims()); + const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( "The size of axis should be larger than 0, but received " "SizeToAxis of logit_grad is %d.", n)); - const int d = SizeFromAxis(axis, logit_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); Tensor logit_grad_2d, labels_2d, out_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); labels_2d.ShareDataWith(labels).Resize({n, 
labels.numel() / n}); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index a51f68530caf88a8f5abe2b4615180266f409a8c..1cd6f8b7698b949a8e198c766fcf193e13481298 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "If soft_label=False, axis must be -1 or" " can be regard as last dimention in mlu kernel.")); - framework::Tensor labels_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor labels_int32(framework::TransToPhiDataType(VT::INT32)); labels_int32.Resize(labels->dims()); labels_int32.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index a5576ab5af3fdeb38b6c1fe87aff32dab412c0be..1f1fbea090c13f2eff7e389c9b7c4774ccbb7700 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + #include #include #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { @@ -40,15 +41,16 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { "the npu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - const int n = SizeToAxis(axis, logits->dims()); - const int d = SizeFromAxis(axis, logits->dims()); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); PADDLE_ENFORCE_EQ( labels->numel(), n, platform::errors::Unimplemented( - "The size of labels should be equal to SizeToAxis of logits," - "but got size of labels is %d and SizeToAxis is %d.", + "The size of labels should be equal to phi::funcs::SizeToAxis of " + "logits," + "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", labels->numel(), n)); loss->mutable_data(ctx.GetPlace()); @@ -97,9 +99,9 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { logits_grad->mutable_data(ctx.GetPlace()); const int rank = logits_grad->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - const int n = SizeToAxis(axis, logits_grad->dims()); - const int d = SizeFromAxis(axis, logits_grad->dims()); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); Tensor logits_grad_2d, loss_grad_1d, backprop_2d; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index 650e488c5e10b8153e4399b0c9eb4fe38a05b215..d9149b85c6a0f15a27dccf3564e50838e34b00c8 100644 --- 
a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -38,13 +38,13 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { Tensor* softmax = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( "axis should == rank - 1")); softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, logits->dims()); - const int d = SizeFromAxis(axis, logits->dims()); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); std::vector logits_dims = phi::vectorize(logits->dims()); const bool soft_label = context.Attr("soft_label"); @@ -122,11 +122,11 @@ class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( "axis should == rank - 1")); - const int n = SizeToAxis(axis, logit_grad->dims()); - const int d = SizeFromAxis(axis, logit_grad->dims()); + const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index fe76448a185c956d8c08c600f0e0f887e2d057b8..db3dc214bfe7ae7ae7facc59deca71ce9dfe91f6 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -25,9 +25,10 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" #if defined(PADDLE_WITH_ONEMKL) -#include "paddle/fluid/platform/dynload/mklrt.h" +#include "paddle/phi/backends/dynload/mklrt.h" #elif defined(PADDLE_WITH_POCKETFFT) #include "extern_pocketfft/pocketfft_hdronly.h" #endif @@ -357,12 +358,12 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { // FFT Functors #if defined(PADDLE_WITH_ONEMKL) -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!platform::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW(platform::errors::External( \ - platform::dynload::DftiErrorMessage(status))); \ +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ } while (0); namespace { @@ -370,7 +371,7 @@ namespace { struct DftiDescriptorDeleter { void operator()(DFTI_DESCRIPTOR_HANDLE handle) { if (handle != nullptr) { - MKL_DFTI_CHECK(platform::dynload::DftiFreeDescriptor(&handle)); + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); } } }; @@ -385,7 +386,7 @@ class DftiDescriptor { "DftiDescriptor has already been initialized.")); DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(platform::dynload::DftiCreateDescriptorX( + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( 
&raw_desc, precision, signal_type, signal_ndim, sizes)); desc_.reset(raw_desc); } @@ -437,21 +438,21 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); // placement inplace or not inplace - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( - descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, + DFTI_NOT_INPLACE)); // number of transformations const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); // input & output distance const MKL_LONG idist = in_strides[0]; const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - DFTI_OUTPUT_DISTANCE, odist)); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); // input & output stride std::vector mkl_in_stride(1 + signal_ndim, 0); @@ -460,14 +461,14 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, mkl_in_stride[i] = in_strides[i]; mkl_out_stride[i] = out_strides[i]; } - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); // conjugate even storage if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } @@ -489,12 +490,12 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_BACKWARD_SCALE; } }(); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - scale_direction, scale)); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); } // commit the descriptor - MKL_DFTI_CHECK(platform::dynload::DftiCommitDescriptor(descriptor.get())); + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); return descriptor; } @@ -575,39 +576,39 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, framework::TransToProtoVarType(out->dtype()), input_stride, output_stride, signal_sizes, normalization, forward); - const FFTTransformType fft_type = GetFFTTransformType(x->type(), out->type()); + const FFTTransformType fft_type = + GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->type())); if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj( - framework::TransToProtoVarType(collapsed_input.dtype())); + framework::Tensor collapsed_input_conj(collapsed_input.dtype()); collapsed_input_conj.mutable_data(collapsed_input.dims(), ctx.GetPlace()); // conjugate the input platform::ForRange for_range(ctx, collapsed_input.numel()); - math::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); + phi::funcs::ConjFunctor functor(collapsed_input.data(), + collapsed_input.numel(), + 
collapsed_input_conj.data()); for_range(functor); - MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( desc.get(), collapsed_input_conj.data(), collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj( - framework::TransToProtoVarType(collapsed_output.dtype())); + framework::Tensor collapsed_output_conj(collapsed_output.dtype()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); - MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output_conj.data())); // conjugate the output platform::ForRange for_range(ctx, collapsed_output.numel()); - math::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); + phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), + collapsed_output.numel(), + collapsed_output.data()); for_range(functor); } else { if (forward) { - MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output.data())); } else { - MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( desc.get(), collapsed_input.data(), collapsed_output.data())); } } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index f04ba72a1e181654466acac52ffe58cd74cdc2da..a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ USE_OP(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(softmax); +USE_OP_ITSELF(softmax); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc index a9f835f6fe2c25d6ffdfae93e1c7cd170db6b891..102902bdaaaaf4a6a94699f561a5e91213be8c44 100644 --- a/paddle/fluid/operators/top_k_op_mlu.cc +++ b/paddle/fluid/operators/top_k_op_mlu.cc @@ -47,7 +47,7 @@ class TopkMLUKernel : public framework::OpKernel { const bool sorted = true; const int axis = -1; // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 7bada0179a1c5e73669b07fd77171f764db6e21c..5b8a6b3e75449508afa5d316d81f97ab815c9ea9 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -55,7 +55,7 @@ class TopkV2MLUKernel : public framework::OpKernel { indices->mutable_data(place); // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index bd3dc002990a7cf3af738eb2d914b3fc3dd9e79a..54f4deac80a74e2e471036c2e25d08a582e29a9d 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,14 +23,6 @@ namespace operators { class TruncOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "trunc"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "trunc"); - auto input_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", input_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class TruncOpMaker : public framework::OpProtoAndCheckerMaker { @@ -75,9 +69,13 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, ops::TruncGradOpMaker, - ops::TruncGradOpMaker); + ops::TruncGradOpMaker, + TruncInferShapeFunctor); REGISTER_OPERATOR(trunc_grad, ops::TruncGradOp); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index f980e007271e3cfd8cf80c2f69ee32cde12aff0f..6eb7f922dfdbec41aa1c47d11e1decc259d08689 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -23,28 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); - TruncatedNormal truncated_normal(mean, std); - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = truncated_normal(dist(*engine)); - } - } -}; - class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -124,5 +102,3 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, ops::TruncatedGaussianRandomOp, ops::TruncatedGaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(truncated_gaussian_random, - ops::CPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu deleted file mode 100644 index 5e530a5bb5248e79d6ba19b23f86788a2eb3315f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" - -namespace paddle { -namespace operators { - -template -struct GPUTruncatedNormal { - T mean, std; - T a_normal_cdf; - T b_normal_cdf; - unsigned int seed; - T numeric_min; - - __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) - : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; - } - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); - rng.discard(n); - T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; - } -}; - -template -struct TruncatedNormalOffset { - T mean, std; - T a_normal_cdf; - T b_normal_cdf; - unsigned int seed; - T numeric_min; - int offset_; - - __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min, - int seed, int offset) - : mean(mean), - std(std), - seed(seed), - numeric_min(numeric_min), - offset_(offset) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; - } - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); - rng.discard(n + offset_); - T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; - } -}; - -template -class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormalOffset(mean, std, std::numeric_limits::min(), - seed_offset.first, gen_offset)); - } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GPUTruncatedNormal( - mean, std, std::numeric_limits::min(), seed)); - } - } -}; - -} // namespace operators -} // 
namespace paddle - -REGISTER_OP_CUDA_KERNEL( - truncated_gaussian_random, - paddle::operators::GPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/unbind_op.cc b/paddle/fluid/operators/unbind_op.cc index 3fce0f8f47d32a602d56e88b43ddb9bf3d4b15f8..f2fc08308c6b32868adc8057c9bc2a92c4247c60 100644 --- a/paddle/fluid/operators/unbind_op.cc +++ b/paddle/fluid/operators/unbind_op.cc @@ -14,6 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/unbind_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -79,11 +82,3 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(unbind, ops::UnbindOp, ops::UnbindOpMaker, ops::UnbindGradMaker, ops::UnbindGradMaker); -namespace plat = paddle::platform; -REGISTER_OP_CPU_KERNEL( - unbind, ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel); diff --git a/paddle/fluid/operators/unbind_op.cu.cc b/paddle/fluid/operators/unbind_op.cu.cc deleted file mode 100644 index cec7058d3cf52eff55eb88afaa217204a72e4566..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/unbind_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unbind_op.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - unbind, ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel); diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 69808e3f9fe9ed4a92152fc89532a7470bf85f6f..6e35f262de420744b5299fbf1ab540e34c711d92 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -34,27 +34,6 @@ static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims, } return phi::make_ddim(out_dims); } -template -class UnbindOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - int axis = ctx.Attr("axis"); - - auto in_dims = in->dims(); - axis = axis < 0 ? 
in_dims.size() + axis : axis; - std::vector shape_refer; - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - shape_refer.emplace_back(outs[j]); - } - - auto& dev_ctx = ctx.template device_context(); - math::SplitFunctor functor; - functor(dev_ctx, *in, shape_refer, axis, &outs); - } -}; template class UnbindGradMaker : public framework::SingleGradOpMaker { diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 0a8cd6e65f93e080797a17eb110b10e53b8ddc69..c45b839d5b40bd1d0db25743406bb8cc319f1280 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -12,7 +12,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/unfold_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -60,126 +62,6 @@ feature map, a series of such columns will be formed. class UnfoldOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of UnfoldOp should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Y"), true, - platform::errors::NotFound("Output(Y) of UnfoldOp should not be null")); - auto in_dims = ctx->GetInputDim("X"); - std::vector kernel_sizes = - ctx->Attrs().Get>("kernel_sizes"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector dilations = - ctx->Attrs().Get>("dilations"); - - // Only [N, C, H, W] input supported now - PADDLE_ENFORCE_EQ( - in_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be 4-D tensor of format [N, C, H, W], but get %u", - in_dims.size())); - PADDLE_ENFORCE_EQ( - in_dims.size() - kernel_sizes.size(), 2U, - platform::errors::InvalidArgument( - "The dims of X should be larger than that of kernel_sizes " - "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", - in_dims.size(), kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - strides.size(), kernel_sizes.size(), - platform::errors::InvalidArgument( - "The dims of strides should be the same with that of kernel_sizes. " - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", - strides.size(), kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - paddings.size(), 2 * strides.size(), - platform::errors::InvalidArgument( - "The dims of paddings should be 2 times of that of strides. " - "But recieved dims(paddings: %u) != 2*dims(strides: %u).", - paddings.size(), strides.size())); - PADDLE_ENFORCE_EQ( - strides.size(), dilations.size(), - platform::errors::InvalidArgument( - "The dims of strides should be the same with that of dilations. 
" - "But recieved dims(strides: %u) != dims(dilations: %u).", - strides.size(), dilations.size())); - - // check kernel_sizes - PADDLE_ENFORCE_GT(kernel_sizes[0], 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", - kernel_sizes[0], kernel_sizes[1])); - PADDLE_ENFORCE_GT(kernel_sizes[1], 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", - kernel_sizes[0], kernel_sizes[1])); - // check strides - PADDLE_ENFORCE_GT(strides[0], 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", - strides[0], strides[1])); - PADDLE_ENFORCE_GT(strides[1], 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", - strides[0], strides[1])); - // check dilations - PADDLE_ENFORCE_GT( - dilations[0], 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", - dilations[0], dilations[1])); - PADDLE_ENFORCE_GT( - dilations[1], 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", - dilations[0], dilations[1])); - - std::vector out_dims; - out_dims.push_back(in_dims[0]); - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - if (ctx->IsRuntime()) { - // only check output height and width in runtime - PADDLE_ENFORCE_GT( - output_height, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size " - "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " - "dilations (%d, %d), is (%d, %d), which should be a " - "positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - PADDLE_ENFORCE_GT( - output_width, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size " - "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " - "dilations (%d, %d), is (%d, %d), which should be a " - "positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - } - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -237,16 +119,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, + PT_INFER_META(phi::UnfoldInferMeta)); REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker, ops::UnfoldGradMaker, - ops::UnfoldGradMaker); + ops::UnfoldGradMaker, + UnfoldInferShapeFunctor); REGISTER_OPERATOR(unfold_grad, ops::UnfoldGradOp, ops::UnfoldGradOpNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - unfold, 
ops::UnfoldOpKernel, - ops::UnfoldOpKernel); -REGISTER_OP_CPU_KERNEL( - unfold_grad, - ops::UnfoldGradOpKernel, - ops::UnfoldGradOpKernel); diff --git a/paddle/fluid/operators/unfold_op.cu b/paddle/fluid/operators/unfold_op.cu deleted file mode 100644 index 46584506d431564cfc7af11072eee6c544f03564..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/unfold_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unfold_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unfold, ops::UnfoldOpKernel, - ops::UnfoldOpKernel); - -REGISTER_OP_CUDA_KERNEL( - unfold_grad, - ops::UnfoldGradOpKernel, - ops::UnfoldGradOpKernel); diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h deleted file mode 100644 index f35bce3abff2b272d589067d27d31b4d3c6191a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/unfold_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -inline int CalcOutputSize(int input_size, int filter_size, int dilation, - int padding1, int padding2, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; - return output_size; -} - -template -class UnfoldOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("X"); - const int batch_size = static_cast(input->dims()[0]); - Tensor* output = ctx.Output("Y"); - output->mutable_data(ctx.GetPlace()); - - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - math::Im2ColFunctor im2col; - auto& dev_ctx = ctx.template device_context(); - - auto input_dims = input->dims(); - - int output_height = - CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0], - paddings[0], paddings[2], strides[0]); - int output_width = - CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - - framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]}); - framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0], - kernel_sizes[1], output_height, - output_width}); - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - im2col(dev_ctx, in_batch, dilations, strides, paddings, &out_batch); - } - } -}; - -template -class UnfoldGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* output_grad = ctx.Input(framework::GradVarName("Y")); - Tensor* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - - if ((!output_grad) || (!input_grad)) return; - - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input_grad->dims()[0]); - - auto input_dims = input_grad->dims(); - - int output_height = - CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0], - paddings[0], paddings[2], strides[0]); - int output_width = - CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - - framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]}); - framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0], - kernel_sizes[1], output_height, - output_width}); - - math::Col2ImFunctor col2im; - auto& dev_ctx = ctx.template device_context(); - - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, input_grad, static_cast(0)); - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); - col2im(dev_ctx, out_grad_batch, dilations, strides, paddings, - &in_grad_batch); - } - } -}; -} // namespace operators -} // 
namespace paddle diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index a5231354eb47ea3b5cdd802a9b77f7ba7e313c1e..b8d8467b7eba9f360d8b2043bd4ed3f63e42725a 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -12,130 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" +#include "paddle/phi/kernels/full_kernel.h" namespace paddle { namespace operators { - -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -__global__ void fill_value(int64_t size, T* data, float value) { - for (int idx = threadIdx.x; idx < size; idx += blockDim.x) { - data[idx] = static_cast(value); - } -} - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random as uniform_random_op.cu. 
template class GPUUniformRandomInplaceKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out_var = ctx.OutputVar("Out"); - auto* tensor = out_var->GetMutable(); - T* data = tensor->mutable_data(ctx.GetPlace()); - unsigned int seed = static_cast(ctx.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(ctx.Attr("min")); - T max = static_cast(ctx.Attr("max")); - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - T diag_val = static_cast(ctx.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + UniformRandom(context, tensor); } }; @@ -143,17 +30,15 @@ template class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __HIPCC__ - const int64_t kMaxBlockDim = 256; -#else - const int64_t kMaxBlockDim = 512; -#endif auto* dx = ctx.Output(framework::GradVarName("X")); - auto* data = dx->mutable_data(ctx.GetPlace()); - - auto size = dx->numel(); - int64_t kBlockDim = std::min(size, kMaxBlockDim); - fill_value<<<1, kBlockDim, 0>>>(size, data, static_cast(0)); + auto dims = vectorize(dx->dims()); + const auto& dev_cxt = + ctx.template device_context(); + float value = static_cast(0.0f); + phi::FullKernel( + static_cast::TYPE&>(dev_cxt), + dims, value, phi::DataType::UNDEFINED, dx); } }; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 086c57527b48ffc940c029fb462afd6c22d86f98..fb38a6aded4cf173bb4c0dd96d131ff520b6701e 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,88 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/uniform_random_op.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. 
template class GPUUniformRandomKernel : public framework::OpKernel { public: @@ -128,50 +51,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); } - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(context.Attr("min")); - T max = static_cast(context.Attr("max")); - unsigned int diag_num = - static_cast(context.Attr("diag_num")); - unsigned int diag_step = - static_cast(context.Attr("diag_step")); - T diag_val = static_cast(context.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + UniformRandom(context, tensor); } }; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index be6c3c740e692c17504fb36bd807c06768da2ee9..a864c48ad757411861b6d2b3be40361c347601f8 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -18,6 +18,16 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#if defined(__NVCC__) || defined(__HIPCC__) +DECLARE_bool(use_curand); +#include +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/index_impl.cu.h" +#include "paddle/phi/kernels/full_kernel.h" +#endif namespace paddle { namespace operators { @@ -102,5 +112,117 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, + int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int 
diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, + int diag_num, int diag_step, + T diag_val, int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandom(const framework::ExecutionContext& context, + framework::Tensor* tensor) { + int64_t size = tensor->numel(); + auto& dev_cxt = + context.template device_context(); + T* data = tensor->mutable_data(dev_cxt.GetPlace()); + if (size <= 0) return; + unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + unsigned int diag_num = + static_cast(context.Attr("diag_num")); + unsigned int diag_step = + static_cast(context.Attr("diag_step")); + T diag_val = static_cast(context.Attr("diag_val")); + int device_id = context.GetPlace().GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset); + IndexKernel>(dev_cxt, tensor, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_cxt, tensor, func); + } +} +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 54b0d5b69086cda3ebdefa76636aff734d1a150c..61a1691e4fe265035917ed2407d5e3e24aa6bd88 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/where_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" @@ -20,6 +21,15 @@ namespace platform = paddle::platform; namespace paddle { namespace operators { +template +struct CondFunctor { + HOSTDEVICE inline CondFunctor() {} + + HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { + return cond ? 
x : y;
+  }
+};
+
 template <typename T>
 __global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x,
                                 const T* y, T* out) {
@@ -63,10 +73,11 @@ class WhereKernel
     auto stream = context.cuda_device_context().stream();
     auto& dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
-    auto config = GetGpuLaunchConfig1D(dev_ctx, numel);
-    WhereCUDAKernel<
-        T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-        numel, cond_data, x_data, y_data, out_data);
+    auto functor = CondFunctor<T>();
+    std::vector<const framework::Tensor*> ins = {condition, X, Y};
+    std::vector<framework::Tensor*> outs = {out};
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index be02bac1aa0ef7462e15f9471a84f79a6007cfb5..04c8a329e5e1a3cc7177a09d592d46ba3ac700ec 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -28,7 +28,7 @@ cc_library(denormal SRCS denormal.cc DEPS)
 
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
 
-set(enforce_deps flags errors boost flags pten_enforce)
+set(enforce_deps flags errors boost flags phi_enforce)
 if(WITH_GPU)
   set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
@@ -52,7 +52,7 @@ ELSE()
 cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade)
 ENDIF()
 
-cc_library(place SRCS place.cc DEPS enforce boost pten_place)
+cc_library(place SRCS place.cc DEPS enforce boost phi_place)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 IF(WITH_MKLDNN)
@@ -122,7 +122,7 @@ cc_library(init SRCS init.cc DEPS device_context custom_kernel)
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
-    place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
+    place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
     ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator)
 if(WITH_XPU)
   target_link_libraries(device_context xpu_context)
@@ -138,14 +138,24 @@ if(WITH_CNCL)
 endif()
 
 if(WITH_GPU OR WITH_ROCM)
-  target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
+  target_link_libraries(device_context gpu_info gpu_context phi_gpu_info)
   target_link_libraries(device_context gpu_resource_pool)
 endif()
-
+if (WITH_CUSTOM_DEVICE)
+  target_link_libraries(device_context custom_context)
+endif()
 if(WITH_ASCEND_CL)
   target_link_libraries(device_context npu_resource_pool)
 endif()
 
+if(WITH_MLU)
+  target_link_libraries(device_context mlu_resource_pool)
+endif()
+
+if(WITH_CUSTOM_DEVICE)
+  target_link_libraries(device_context custom_context)
+endif()
+
 cc_test(init_test SRCS init_test.cc DEPS device_context)
 
 # Manage all device event library
diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h
index 40204c0ed83f94da9de11378cf49c652e4f63962..08beed532a7ec1bbc9cd866c90c938493a15f5c1 100644
--- a/paddle/fluid/platform/cuda_device_guard.h
+++ b/paddle/fluid/platform/cuda_device_guard.h
@@ -14,13 +14,28 @@
 #pragma once
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
 
 class CUDADeviceGuard {
  public:
-  explicit inline CUDADeviceGuard(int dev_id) {
+  explicit CUDADeviceGuard(int
dev_id) { SetDeviceIndex(dev_id); } + + explicit CUDADeviceGuard(const CUDAPlace& place) + : CUDADeviceGuard(place.device) {} + + // create uninitialized CUDADeviceGuard + CUDADeviceGuard() {} + + ~CUDADeviceGuard() { + if (prev_id_ != -1) { + platform::SetDeviceId(prev_id_); + } + } + + inline void SetDeviceIndex(const int dev_id) { int prev_id = platform::GetCurrentDeviceId(); if (prev_id != dev_id) { prev_id_ = prev_id; @@ -28,10 +43,9 @@ class CUDADeviceGuard { } } - inline ~CUDADeviceGuard() { - if (prev_id_ != -1) { - platform::SetDeviceId(prev_id_); - } + void SetDevice(const CUDAPlace& place) { + int dev_id = place.device; + SetDeviceIndex(dev_id); } CUDADeviceGuard(const CUDADeviceGuard& o) = delete; diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 00f0cc2ac92bf7edac1766358f14651844570cd9..f7c13ec7ed5edc034813360322967b6cb4643087 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,12 +1,12 @@ IF(WITH_GPU) add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) ELSEIF(WITH_ROCM) add_subdirectory(rocm) - hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) + hip_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 8f7fd3dcbc03a2a780c70ce4e8599db864747da1..85050038d5a8363b005ed2397c9f6c3c03f18b62 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) -nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda phi) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 367fb3de47c781b47f1b7794e6e873d1f784d697..f17a814175fa0748475099d5cc033d274134357f 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -105,6 +105,18 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, return float16(__shfl_xor_sync(mask, val.to_half(), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_xor_sync(mask, static_cast(val), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) { diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 
3e070da546b2ae85c40bb0e9cae05cc30d6d22c1..8aec8e840f33273a3130355c751e635e4a3f6736 100644
--- a/paddle/fluid/platform/device/gpu/gpu_primitives.h
+++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -147,6 +147,101 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
   }
 }
 #endif
+
+// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )"
+// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up.
+template <typename T, typename std::enable_if<
+                          std::is_same<platform::float16, T>::value>::type * =
+                          nullptr>
+__device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index,
+                                              const size_t numel, T value) {
+#if ((CUDA_VERSION < 10000) || \
+     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  CudaAtomicAdd(reinterpret_cast<platform::float16 *>(tensor) + index,
+                static_cast<platform::float16>(value));
+#else
+  // whether the address is __half2-aligned (4 bytes).
+  __half *target_addr = reinterpret_cast<__half *>(tensor + index);
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
+
+  if (aligned_half2 && index < (numel - 1)) {
+    __half2 value2;
+    value2.x = *reinterpret_cast<__half *>(&value);
+    value2.y = __int2half_rz(0);
+    atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2);
+
+  } else if (!aligned_half2 && index > 0) {
+    __half2 value2;
+    value2.x = __int2half_rz(0);
+    value2.y = *reinterpret_cast<__half *>(&value);
+    atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2);
+
+  } else {
+    atomicAdd(reinterpret_cast<__half *>(tensor) + index,
+              *reinterpret_cast<__half *>(&value));
+  }
+#endif
+}
+
+template <typename T, typename std::enable_if<
+                          !std::is_same<platform::float16, T>::value>::type * =
+                          nullptr>
+__device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index,
+                                              const size_t numel, T value) {
+  CudaAtomicAdd(arr + index, value);
+}
+
+#ifdef PADDLE_WITH_CUDA
+/*
+ * One thread block deals with elementwise atomicAdd for vector of len.
+ * @in: [x1, x2, x3, ...]
+ * @out:[y1+x1, y2+x2, y3+x3, ...]
+ * */
+template <typename T, typename std::enable_if<
+                          !std::is_same<platform::float16, T>::value>::type * =
+                          nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+}
+
+// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
+template <typename T, typename std::enable_if<
+                          std::is_same<platform::float16, T>::value>::type * =
+                          nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+#if ((CUDA_VERSION < 10000) || \
+     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+#else
+  int i = 0;
+  int loops = len / 2 * 2;
+
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
+
+  if (aligned_half2) {
+    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
+      __half2 value2;
+      T value_1 = in[i];
+      T value_2 = in[i + 1];
+      value2.x = *reinterpret_cast<__half *>(&value_1);
+      value2.y = *reinterpret_cast<__half *>(&value_2);
+      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
+    }
+    for (; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  } else {
+    for (int i = tid; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  }
+#endif
+}
+#endif
 #endif
 
 CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index 1d6ccdc1280a9f9575c048c37700ecc7c8cd6892..1919f59f8c07f2a0a15393fe14f2055f8d0c19bf 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -56,6 +56,23 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
   }
 }
 
+inline ncclDataType_t ToNCCLDataType(experimental::DataType type) {
+  if (type == experimental::DataType::FLOAT32) {
+    return ncclFloat;
+  } else if (type == experimental::DataType::FLOAT64) {
+    return ncclDouble;
+  } else if (type == experimental::DataType::INT32) {
+    return ncclInt;
+  } else if (type == experimental::DataType::INT64) {
+    return ncclInt64;
+  } else if (type == experimental::DataType::FLOAT16) {
+    return ncclFloat16;
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "This datatype in nccl is not supported."));
+  }
+}
+
 // NOTE(minqiyang): according to the ncclGroupEnd documentations:
 // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
 // ncclGroupEnd will wait for all communicators to be initialized, which will
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
index 63897bd6717408bff4bd4db5e739b3ba64316350..61bf1905fdb74f084a60688094269b89c2a11c28 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -91,6 +91,13 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
   return float16(__shfl_xor(static_cast<float>(val), width));
 }
 
+template <>
+__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask,
+                                                       bfloat16 val,
+                                                       int width) {
+  return bfloat16(__shfl_xor(static_cast<float>(val), width));
+}
+
 template <>
 __forceinline__ __device__ paddle::platform::complex<float> CudaShuffleXorSync(
     unsigned mask, paddle::platform::complex<float> val, int width) {
diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt
index d54c6a33ecbf53071956aaf4b9d342efa5746f65..acf914c5087d0ff11cda2d663a490e84a8c33216 100644
--- a/paddle/fluid/platform/device/ipu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt
@@ -13,9 +13,9 @@ IF(WITH_IPU)
     "ipu_device.cc"
     )
 
-  cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper)
-  cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS
popart enforce) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) + add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 8f2a7ef78c9824d7706be48f117a86b19c334b8a..e0b3b08a2313d0ba80e807494eb74612caf81fd5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -43,17 +43,17 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; - compiler_->Prepare(); - executor_->SetCompilerResources(compiler_->GetResources()); - - compiler_->InitInputs(graph, feed_list); - compiler_->LowerConstants(graph, scope_); - compiler_->LowerWeights(graph, scope_); - compiler_->LowerBody(graph); + compiler_->Prepare(graph); + compiler_->InitInputs(feed_list); + compiler_->LowerConstants(scope_); + compiler_->LowerWeights(scope_); + compiler_->LowerBody(); compiler_->InitOutputs(fetch_list); if (ipu_strategy_->is_training) { - compiler_->LowerOptimier(graph, scope_); + compiler_->LowerOptimizer(scope_); } + executor_->SetCompilerResources(compiler_->GetResources()); + is_compiled_ = true; // when call compile, means a new graph is_prepared_ = false; @@ -95,11 +95,9 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ipu_strategy_ = &strategy; compiler_->SetIpuStrategy(strategy); executor_->SetIpuStrategy(strategy); -} - -void IpuBackend::SetCustomOps( - const std::vector& custom_ops) { - compiler_->SetCustomOps(custom_ops); + if (!strategy.custom_ops.empty()) { + compiler_->SetCustomOps(strategy.custom_ops); + } } void IpuBackend::SaveModelProto(const std::string& path) { diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index b12e2539258dfefe93e0828fa1a7341e21d62e70..1244192490c16c4cfb01ac1c5f195cc123c4ba16 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -71,7 +71,6 @@ class IpuBackend { const Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } - void SetCustomOps(const std::vector &custom_ops); // save compiled model to onnx void SaveModelProto(const std::string &path); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index df2e456383e1754956810f254cd98651e3139bcf..cdb3f6f9b3e285728d5c372b51492e42027aadba 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -98,6 +98,19 @@ TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { } } +GraphHelper::GraphHelper(const Graph* g) { + graph = g; + sorted_ops = framework::ir::TopologySortOperations(*g); + for (auto* node : g->Nodes()) { + nodes_id_map[node->id()] = node; + if (node->IsVar()) { + vars_name_map[node->Name()] = node; + sorted_vars_id.push_back(node->id()); + } + } + std::sort(sorted_vars_id.begin(), sorted_vars_id.end()); +} + Compiler::Compiler() { 
RegisterOpFunc(); } Compiler::~Compiler() { @@ -105,9 +118,10 @@ Compiler::~Compiler() { resources_.reset(); } -void Compiler::Prepare() { +void Compiler::Prepare(const Graph* graph) { builder_ = popart::Builder::create(); resources_ = std::make_unique(); + graph_helper_ = std::make_unique(graph); } void Compiler::RegisterOpFunc() { @@ -171,93 +185,24 @@ void Compiler::RegisterOpFunc() { #undef INT_VEC } -void Compiler::LowerBody(const Graph* graph) { - VLOG(10) << "enter Compiler::LowerBody"; - auto nodes = framework::ir::TopologySortOperations(*graph); - for (auto* node : nodes) { - auto* op_desc = node->Op(); - auto op_type = op_desc->Type(); - VLOG(10) << "lowering op: " << op_type; - - if (op_type == "popart_constant") { - // pass - } else if (op_type == "popart_optimizer") { - // pass - } else if (op_type == "popart_checkpointoutput") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto output_ids = builder_->checkpointOutput(inputs); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_custom_op") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto attributes = std::map{}; - for (auto& attr : op_desc->GetAttrMap()) { - CustomOpAttrVisitor visitor(&attributes, attr.first); - boost::apply_visitor(visitor, attr.second); - } - auto __op_type = - BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); - VLOG(10) << "Build graph from custom op: " << __op_type; - auto it = custom_ops_.find(__op_type); - auto output_ids = - builder_->customOp(it->second.popart_op, it->second.popart_op.version, - inputs, outputs.size(), attributes, debug_context); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_printtensor") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto print_gradient = - BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); - auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); - auto output_ids = builder_->aiGraphcoreOpset1().printtensor( - inputs, print_gradient, debug_context, title); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else { - auto itr = name_function_.find(op_type); - if (itr != name_function_.end()) { - itr->second(node->Op()); - } else { - PADDLE_THROW(platform::errors::NotFound( - "%s is not registered, please check for unsupported operators for " - "running on IPU", - op_type)); - } - } - } - VLOG(10) << "leave Compiler::LowerBody"; -} - -void Compiler::InitInputs(Graph* graph, - const std::vector& feed_list) { +void Compiler::InitInputs(const std::vector& feed_list) { for (const auto& feed_name : feed_list) { - feed_list_.push_back(feed_name); - for (const Node* n : graph->Nodes()) { - if (n->IsVar()) { - auto* var_desc = n->Var(); - if (feed_name == var_desc->Name()) { - VLOG(10) << "feed_name= " << var_desc->Name(); - auto data_type = VarType2PopartType(var_desc->GetDataType()); - popart::TensorInfo input_info{data_type, var_desc->GetShape()}; - VLOG(10) << "popart input_info = " << input_info; - popart::TensorId tensor_id = - builder_->addInputTensor(input_info, feed_name); - VLOG(10) << "popart input tensor id = " << tensor_id; - resources_->inputs.push_back(tensor_id); - resources_->tensors.emplace(var_desc->Name(), tensor_id); - } - } - } + auto* node = graph_helper_->vars_name_map[feed_name]; + auto* var_desc = 
node->Var(); + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + popart::TensorInfo input_info{data_type, var_desc->GetShape()}; + VLOG(10) << "popart input_info = " << input_info; + popart::TensorId tensor_id = + builder_->addInputTensor(input_info, feed_name); + VLOG(10) << "popart input tensor id = " << tensor_id; + resources_->inputs.push_back(tensor_id); + resources_->tensors.emplace(var_desc->Name(), tensor_id); } } void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { - fetch_list_.push_back(fetch_name); auto tensor = resources_->tensors.find(fetch_name); PADDLE_ENFORCE_NE( tensor, resources_->tensors.end(), @@ -271,14 +216,10 @@ void Compiler::InitOutputs(const std::vector& fetch_list) { } } -void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { +void Compiler::LowerConstants(const Scope* scope) { auto& kid_scope = scope->NewScope(); VLOG(10) << "enter Compiler::LowerConstants"; - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; - } - + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_constant") { @@ -308,17 +249,16 @@ void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerConstants"; } -void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { +void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "enter Compiler::LowerWeights"; - PADDLE_ENFORCE_NOT_NULL(scope, - platform::errors::PreconditionNotMet( - "You should call set_scope before LowerWeights")); // at this step, the graph doesn't contains optimizer related states - for (const auto* node : graph->Nodes()) { + for (auto id : graph_helper_->sorted_vars_id) { + auto* node = graph_helper_->nodes_id_map[id]; if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->Var()->Persistable() && node->inputs.empty()) { auto var_name = node->Var()->Name(); if (resources_->tensors.count(var_name) != 0) { + VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } VLOG(10) << "lowering weight: " << var_name; @@ -344,12 +284,68 @@ void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerWeights"; } -void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) { - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; +void Compiler::LowerBody() { + VLOG(10) << "enter Compiler::LowerBody"; + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "lowering op: " << op_type; + + if (op_type == "popart_constant") { + // pass + } else if (op_type == "popart_optimizer") { + // pass + } else if (op_type == "popart_checkpointoutput") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto output_ids = builder_->checkpointOutput(inputs); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_custom_op") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto attributes = std::map{}; + for (auto& attr : op_desc->GetAttrMap()) { + CustomOpAttrVisitor visitor(&attributes, attr.first); + boost::apply_visitor(visitor, attr.second); + } + auto __op_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); + VLOG(10) << "Build graph from custom op: 
" << __op_type; + auto it = custom_ops_.find(__op_type); + auto output_ids = + builder_->customOp(it->second.popart_op, it->second.popart_op.version, + inputs, outputs.size(), attributes, debug_context); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_printtensor") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto print_gradient = + BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); + auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + auto output_ids = builder_->aiGraphcoreOpset1().printtensor( + inputs, print_gradient, debug_context, title); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "%s is not registered, please check for unsupported operators for " + "running on IPU", + op_type)); + } } + } + VLOG(10) << "leave Compiler::LowerBody"; +} +void Compiler::LowerOptimizer(const Scope* scope) { + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_optimizer") { diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5576266b1a771682ef949c9825309b64c08c0531..5d1e8c2727d8f9ca36c9380584505dbfcabfb064 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -68,34 +68,29 @@ struct CompilerResources { std::unique_ptr optimizer; }; +// helper for lowering graph +struct GraphHelper { + explicit GraphHelper(const Graph *); + + const Graph *graph; + std::map vars_name_map; + std::map nodes_id_map; + std::vector sorted_ops; + std::vector sorted_vars_id; +}; + class Compiler { public: Compiler(); ~Compiler(); - void RegisterOpFunc(); - void Prepare(); - void LowerBody(const Graph *graph); - void InitInputs(Graph *graph, const std::vector &feed_list); + void Prepare(const Graph *graph); + void InitInputs(const std::vector &feed_list); void InitOutputs(const std::vector &fetch_list); - void LowerConstants(const Graph *graph, const Scope *scope); - void LowerWeights(const Graph *graph, const Scope *scope); - void LowerOptimier(const Graph *graph, const Scope *scope); - - void InsertTensors(const std::vector &output_names, - const std::vector &tensor_ids); - void InsertTensors(const std::vector &output_names, - const std::string &tensor_id); - void SetIpuIndexStage(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); - void SetAMPAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); - void SetSerializeAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetSerializeAttributes(const std::string &tensor_id, - const OpDesc *op_desc); + void LowerConstants(const Scope *scope); + void LowerWeights(const Scope *scope); + void LowerBody(); + void LowerOptimizer(const Scope *scope); void SetIpuStrategy(const IpuStrategy &strategy) { ipu_strategy_ = &strategy; @@ -112,21 +107,34 @@ class Compiler { void SaveModelProtoNoCheck(const std::string &path); private: + void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector 
&GetOpOutputs(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); + void InsertTensors(const std::vector &output_names, + const std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); + void SetAMPAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void SetSerializeAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetSerializeAttributes(const std::string &tensor_id, + const OpDesc *op_desc); + private: std::unique_ptr builder_; std::unique_ptr resources_; + std::unique_ptr graph_helper_; using OpFunc = std::function; std::unordered_map name_function_; - // feed_list_ & fetch_list save paddle tensor id - std::vector feed_list_; - std::vector fetch_list_; - const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; }; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 91ab7f3f4f052707ce7ae57147169889cdc4c259..c124d58957fe642365bd5bbf074bc15bfd74c6ba 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -113,7 +113,7 @@ void Executor::Run(const std::vector &inputs, auto fetch_dtype = fetch_info.dataType(); auto paddle_type = PopartType2VarType(fetch_dtype); tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(paddle_type)); + framework::TransToPhiDataType(paddle_type)); anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 4a9b9c00cb75cd042bab527532de3314075e6dcd..943dfcc6cffb875fc3cebfc88e35adeaba47fd63 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -241,6 +241,15 @@ IpuStrategy::IpuStrategy() { #undef ADD_POPART_BOOL_OPTION_ALIAS #undef ADD_POPART_ENUM_OPTION_ALIAS + RegisterGetter(vector_options_getter, options_type, "custom_ops", "vector", + [&]() { + std::vector res; + for (auto x : custom_ops) { + res.push_back(x.repr()); + } + return res; + }); + RegisterSetter(bool_options, "enable_manual_shard", [&](bool value) { if (value) { popart_options.virtualGraphMode = popart::VirtualGraphMode::Manual; @@ -429,6 +438,14 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } +void IpuStrategy::AddCustomOp(const std::string& paddle_op, + const std::string& popart_op, + const std::string& domain, int version) { + LOG(INFO) << "IpuStrategy add custom op: " << paddle_op; + custom_ops.push_back( + IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); +} + std::string IpuStrategy::GetOption(const std::string& option) { return get(option, options_getter); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 0e2af26454c401960773de20744f285aecec6bed..64436dc14fec3393b0a2a4473ad436d7d08f5217 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -71,6 +72,9 @@ struct IpuStrategy { // popart pattern manager popart::Patterns popart_patterns; + // custom ops + std::vector custom_ops; + private: std::map> bool_options; std::map> uint64_options; @@ -123,6 +127,8 @@ struct IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, + const std::string &domain, int version); std::string GetOption(const std::string &); std::vector GetVectorOption(const std::string &); diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 724776bfad2339a1cc58cbca30768311ce0cbd3f..1f3a7670849c2c8a0e8eb87bcd5ef63709fe6ec4 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -9,3 +9,4 @@ cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_man cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream) cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) cc_library(mlu_collective_helper SRCS mlu_collective_helper.cc DEPS mlu_stream mlu_info) +cc_library(mlu_resource_pool SRCS mlu_resource_pool.cc DEPS mlu_info) diff --git a/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc b/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbe3eca1c4d23fc07bec30b6b7ed22c731944ad2 --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
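The AddCustomOp hook above records an IpuCustomOpIdentifier that the IPU compiler later resolves through its custom_ops_ map when lowering an op it has no native handler for. A minimal registration sketch, assuming the enclosing namespace is paddle::platform::ipu; the Paddle op name, PopART op, and domain below are made-up placeholders:

    paddle::platform::ipu::IpuStrategy strategy;
    // Map the hypothetical Paddle op "custom_relu" onto PopART "Relu",
    // domain "custom.ops", opset version 1.
    strategy.AddCustomOp("custom_relu", "Relu", "custom.ops", 1);
    // Registrations can be read back through the "custom_ops" vector option.
    auto registered = strategy.GetVectorOption("custom_ops");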
+ +#if defined(PADDLE_WITH_MLU) +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" + +namespace paddle { +namespace platform { + +MluStreamResourcePool::MluStreamResourcePool() { + int dev_cnt = platform::GetMLUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetMLUDeviceId(dev_idx); + mluStream stream; + cnrtQueueCreate(&stream); + return stream; + }; + + auto deleter = [dev_idx](mluStream stream) { + platform::SetMLUDeviceId(dev_idx); + cnrtQueueDestroy(stream); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +MluStreamResourcePool& MluStreamResourcePool::Instance() { + static MluStreamResourcePool pool; + return pool; +} + +std::shared_ptr MluStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +MluEventResourcePool::MluEventResourcePool() { + int dev_cnt = platform::GetMLUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetMLUDeviceId(dev_idx); + mluEventHandle event; + cnrtNotifierCreate(&event); + return event; + }; + + auto deleter = [dev_idx](mluEventHandle event) { + platform::SetMLUDeviceId(dev_idx); + cnrtNotifierDestroy(event); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +MluEventResourcePool& MluEventResourcePool::Instance() { + static MluEventResourcePool pool; + return pool; +} + +std::shared_ptr MluEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/mlu/mlu_resource_pool.h b/paddle/fluid/platform/device/mlu/mlu_resource_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..b0e2af7f024cb88a06f7e7bfa13c61d1a825a2a6 --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
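The new MLU pools mirror the existing CUDA stream/event resource pools: one lazily created ResourcePool per device, handing out shared_ptr-managed queues and notifiers. A minimal sketch of borrowing a stream, assuming a PADDLE_WITH_MLU build:

    #include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"

    void UseBorrowedStream() {
      // Borrow a queue for device 0; the shared_ptr's deleter hands it back
      // to the pool when released.
      auto stream = paddle::platform::MluStreamResourcePool::Instance().New(0);
      // stream.get() is the raw mluStream (a cnrtQueue_t) to enqueue work on.
    }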
+ +#pragma once + +#if defined(PADDLE_WITH_MLU) +#include +#include +#include + +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using MluStreamObject = std::remove_pointer::type; +using MluEventObject = std::remove_pointer::type; + +class MluStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static MluStreamResourcePool &Instance(); + + private: + MluStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(MluStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class MluEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static MluEventResourcePool &Instance(); + + private: + MluEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(MluEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 90c0851d79d8079d35c4bf035f130c9c86089c7e..d45492391dc88ce0c690e0768e080dd989a0539c 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -467,7 +467,7 @@ void NpuOpRunner::TypeAdapter( } else { tmp_inputs[i].Resize(inputs[i].dims()); tmp_inputs[i].mutable_data(dev_ctx.GetPlace(), - framework::TransToPtenDataType(input_type[i])); + framework::TransToPhiDataType(input_type[i])); const auto &cast_runner = NpuOpRunner( "Cast", {inputs[i]}, {tmp_inputs[i]}, @@ -484,7 +484,7 @@ void NpuOpRunner::TypeAdapter( } else { tmp_outputs[i].Resize(outputs[i].dims()); tmp_outputs[i].mutable_data( - dev_ctx.GetPlace(), framework::TransToPtenDataType(output_type[i])); + dev_ctx.GetPlace(), framework::TransToPhiDataType(output_type[i])); } } diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 28573eb0c1e4ce2a8e6d7a2ba2d61edb6941ce51..b6a26f2554a131aab6e87146c241dc973d9c8f56 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -5,7 +5,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index e27d56642efde7c9b5b11901e57a938050672bf3..e6b08ed7bc340b5150078fe0deb6a3187fb8e17b 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -196,6 +196,7 @@ XPUOpMap& get_kl2_ops() { {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index aa020593454f8f74659ac1a6ba1e5205b2075ec6..f79ef8505d878b28125aaf84574942fb1698de8b 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ 
b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -27,7 +27,10 @@ using XPUKernelSet = using XPUOpMap = std::unordered_map; XPUOpMap& get_kp_ops() { - static XPUOpMap s_xpu_kp_kernels{}; + static XPUOpMap s_xpu_kp_kernels{ + {"elementwise_add", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + }; return s_xpu_kp_kernels; } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6452f6f7984e376ab686c7a417d2431af1045410..6a7956628f80464740e3cd812b0b663cc36d6fc6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -32,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace memory { @@ -171,6 +172,7 @@ inline void EmplaceDeviceContext( .get()); dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get()); } + dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get()); dev_ctx->SetHostAllocator( memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CPUPlace()) @@ -322,7 +324,8 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - platform::RecordEvent record_event("NPUDeviceContext/wait"); + platform::RecordEvent record_event("NPUDeviceContext/wait", + platform::TracerEventType::UserDefined, 2); VLOG(4) << "NPU context(" << this << ") Wait"; stream_->Wait(); } @@ -897,21 +900,13 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) { - DeviceGuard guard(place_); - stream_.reset(new stream::Stream()); - stream_->Init(place_); +CustomDeviceContext::CustomDeviceContext(CustomPlace place) + : phi::CustomContext(place) { + Init(); + stream_.reset(new platform::stream::Stream(place, stream())); } CustomDeviceContext::~CustomDeviceContext() {} - -const Place& CustomDeviceContext::GetPlace() const { return place_; } - -void CustomDeviceContext::Wait() const { - // platform::RecordEvent record_event("NPUDeviceContext/wait"); - VLOG(4) << "CustomDevice context(" << this << ") Wait"; - stream_->Wait(); -} #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 0101286f0dfa87f3bc3b9ff0aae1e6f7342bace7..e9124dfc1f8a7ad3a88c843c1a1573ba3503d80b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/device_context.h" @@ -73,7 +74,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_ext.h" #include "paddle/fluid/platform/device/stream.h" + +#if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__) #include "unsupported/Eigen/CXX11/Tensor" +#endif namespace Eigen { struct DefaultDevice; @@ -819,17 +823,12 @@ class MKLDNNDeviceContext : public CPUDeviceContext { #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -class CustomDeviceContext : public DeviceContext { +class CustomDeviceContext : public phi::CustomContext { public: explicit CustomDeviceContext(CustomPlace place); virtual ~CustomDeviceContext(); - const Place& GetPlace() const override; - void Wait() const override; Eigen::DefaultDevice* eigen_device() const { return nullptr; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); - } template void AddStreamCallback(Callback&& callback) const { @@ -839,13 +838,7 @@ class CustomDeviceContext : public DeviceContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } private: - std::string device_type_; - - CustomPlace place_; - std::shared_ptr stream_; - - CustomDeviceContext(); }; template <> struct DefaultDeviceContextType { diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 49391a65b185b45b35edac5d6217a2e4095b4c4a..87aa5dcde626bafd5e605cc9e35de7cf1b589569 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce phi_dynamic_loader) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) + hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader phi_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) elseif (WITH_ASCEND_CL) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc) else() - nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) + nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader phi_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) endif() if (WITH_MKLML) - cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml phi_dynload_mklml) endif() -cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack) +cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader phi_dynload_lapack) add_dependencies(dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? 
if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt) + cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader phi_dynload_mklrt) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index dacfe2bd2e7f584847abc4c39114061073770e88..854e5a7b9f04a63e43e4e910c26d4e592651c125 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -50,7 +51,8 @@ namespace dynload { __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h index 3b7d23277e065d007d2049d7a63ed3af7e1fdbdb..334b98a1c3d5ab9442dfd2ca6b7c5d7055e94559 100644 --- a/paddle/fluid/platform/dynload/mklrt.h +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -17,7 +17,8 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/mklrt.h" #include "paddle/phi/backends/dynload/port.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index bc29a0472041afbbff84fa346f4dd0f1535925b6..c2d7eef58236952501020d49695356a1a952bc20 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -37,7 +37,7 @@ void* GetTensorRtPluginHandle(); extern std::once_flag tensorrt_plugin_dso_flag; extern void* tensorrt_plugin_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_(__name) \ struct DynLoad__##__name { \ template \ void* operator()(Args... args) { \ @@ -55,7 +55,7 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ @@ -72,7 +72,7 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ @@ -109,10 +109,10 @@ extern void* tensorrt_plugin_dso_handle; #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ __macro(initLibNvInferPlugins); -TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_) TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( - DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) -TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_) +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_) #endif // end of NV_TENSORRT_MAJOR diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 39f95a9295661b2b3432d7ca062b2bdb1fe5c40a..baf043e860be4fd6b0f3b82a43bc5594a083e6eb 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -761,3 +761,15 @@ DEFINE_bool(enable_slotrecord_reset_shrink, false, "enable slotrecord obejct reset shrink memory, default false"); DEFINE_bool(enable_ins_parser_file, false, "enable parser ins file , default false"); + +/** + * ProcessGroupNCCL related FLAG + * Name: nccl_blocking_wait + * Since Version: + * Value Range: bool, default=false + * Example: + * Note: nccl blocking wait. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); +#endif diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index f3f7064efeeb2e1121c09a29473a4a81a063f849..abc427a3ca8815ecf193e4f9213223aa79069ea5 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,136 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" + #include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace paddle { namespace platform { template -struct ForRange { - ForRange(const DeviceContext& dev_ctx, size_t limit); - - template - void operator()(Function func) const; -}; - -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} - - template - void operator()(Function func) const { - for (size_t i = 0; i < limit_; ++i) { - func(i); - } - } - - size_t limit_; -}; - -template <> -struct ForRange { - ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {} - - template - void operator()(Function func) const { - for (size_t i = 0; i < limit_; ++i) { - func(i); - } - } - - size_t limit_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) -template -__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { - size_t idx = static_cast(threadIdx.x); - func(idx); -} - -template -__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx < limit) { - func(idx); - } -} - -// NOTE: After the pten kernel is migrated, it needs to be deleted. 
-template <> -struct ForRange { - ForRange(const CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} - - template - inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON - // JETSON_NANO will throw core dump when threads > 128 - int num_thread = 256; - platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); - const int num_threads = num_thread; -#else - constexpr int num_threads = 1024; -#endif - size_t block_size = limit_ <= num_threads ? limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } - } - - const CUDADeviceContext& dev_ctx_; - size_t limit_; -}; - -template <> -struct ForRange { - ForRange(const phi::GPUContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} - - template - inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON - // JETSON_NANO will throw core dump when threads > 128 - int num_thread = 256; - platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); - const int num_threads = num_thread; -#else - constexpr int num_threads = 1024; -#endif - size_t block_size = limit_ <= num_threads ? limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } - } - - const phi::GPUContext& dev_ctx_; - size_t limit_; -}; - -#endif +using ForRange = phi::funcs::ForRange; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 285c6a4c130530987f3f63b1eecdf2ed1593ef09..01de7349f4823a66b2d180f3d1493477f361273a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1056,7 +1056,7 @@ class ReorderMKLDNNHandler { platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); auto dst_data = output->mutable_data( - place, framework::TransToPtenDataType(vtype_dst_), dst_md.get_size()); + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); return std::make_shared(dst_md, engine_, dst_data); } @@ -1065,7 +1065,7 @@ class ReorderMKLDNNHandler { const MKLDNNMemoryFormat& fmt, platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); auto dst_data = output->mutable_data( - place, framework::TransToPtenDataType(vtype_dst_), dst_md.get_size()); + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); return std::make_shared(dst_md, engine_, dst_data); } diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 58d37783d059709417707a767f306c20d2c65b67..36dd7891d5518681140a86215cb6f0792ee1bdd7 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -95,8 +95,6 @@ std::unordered_map GetAllThreadIds() { return res; } -static constexpr const char* kDefaultThreadName = "unset"; - std::string GetCurrentThreadName() { const auto& thread_name = internal::ThreadDataRegistry::GetInstance() @@ -112,7 +110,7 @@ std::unordered_map GetAllThreadNames() { bool 
SetCurrentThreadName(const std::string& name) { auto& instance = internal::ThreadDataRegistry::GetInstance(); const auto& cur_name = instance.GetCurrentThreadData(); - if (!cur_name.empty() || cur_name == kDefaultThreadName) { + if (!cur_name.empty() || name.empty() || name == kDefaultThreadName) { return false; } instance.SetCurrentThreadData(name); diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index 7f607aaec9763dfe3d76998517b2114218de5e5f..ef894fd3dc28174e01412cfbda83e58482f6ab6d 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -57,7 +57,8 @@ ThreadId GetCurrentThreadId(); // create/destory when using it. std::unordered_map GetAllThreadIds(); -// Returns 'unset' if SetCurrentThreadName is never called. +static constexpr const char* kDefaultThreadName = "unset"; +// Returns kDefaultThreadName if SetCurrentThreadName is never called. std::string GetCurrentThreadName(); // Return the map from StdTid to ThreadName diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt old mode 100644 new mode 100755 index 320e989bd9bb1881e7f1ad0d6d5506fb6e313e24..5acdfa39569f037fb0db5fbb0037f6ce42d2bac0 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,8 +1,11 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer) cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) +cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) add_subdirectory(dump) +cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc new file mode 100644 index 0000000000000000000000000000000000000000..672a9a154535a1cb76a4bbc2bde074b6eecefd9e --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/profiler/cpu_utilization.h" + +namespace paddle { +namespace platform { + +#ifdef _MSC_VER +static uint64_t FileTimeToUint64(FILETIME time) { + uint64_t low_part = time.dwLowDateTime; + uint64_t high_part = time.dwHighDateTime; + uint64_t result = (high_part << 32) | low_part; + return result; +} +#endif + +void CpuUtilization::RecordBeginTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&start_); + GetSystemTimes(&system_idle_time_start_, &system_kernel_time_start_, + &system_user_time_start_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_start_, &process_user_time_start_); + +#elif defined(__linux__) + start_ = times(&process_tms_start_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, + &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +void CpuUtilization::RecordEndTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&end_); + GetSystemTimes(&system_idle_time_end_, &system_kernel_time_end_, + &system_user_time_end_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_end_, &process_user_time_end_); +#elif defined(__linux__) + end_ = times(&process_tms_end_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +float CpuUtilization::GetCpuUtilization() { + float cpu_utilization = 0.0; +#if defined(_MSC_VER) + uint64_t system_user_time_start = FileTimeToUint64(system_user_time_start_); + uint64_t system_user_time_end = FileTimeToUint64(system_user_time_end_); + uint64_t system_kernel_time_start = + FileTimeToUint64(system_kernel_time_start_); + uint64_t system_kernel_time_end = FileTimeToUint64(system_kernel_time_end_); + uint64_t system_idle_time_start = 
FileTimeToUint64(system_idle_time_start_); + uint64_t system_idle_time_end = FileTimeToUint64(system_idle_time_end_); + float busy_time = (system_kernel_time_end - system_kernel_time_start) + + (system_user_time_end - system_user_time_start); + float idle_time = system_idle_time_end - system_idle_time_start; + cpu_utilization = busy_time / (busy_time + idle_time); + +#elif defined(__linux__) + float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) + + (system_tms_end_.tms_stime - system_tms_start_.tms_stime) + + (nice_time_end_ - nice_time_start_) + + (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) + + (steal_end_ - steal_start_); + float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_); + cpu_utilization = busy_time / (busy_time + idle_time); +#else + LOG(WARNING) + << "Current System is not supported to get system cpu utilization" + << cpu_utilization << std::endl; +#endif + return cpu_utilization; +} + +float CpuUtilization::GetCpuCurProcessUtilization() { + float cpu_process_utilization = 0.0; +#ifdef _MSC_VER + uint64_t process_user_time_start = FileTimeToUint64(process_user_time_start_); + uint64_t process_user_time_end = FileTimeToUint64(process_user_time_end_); + uint64_t process_kernel_time_start = + FileTimeToUint64(process_kernel_time_start_); + uint64_t process_kernel_time_end = FileTimeToUint64(process_kernel_time_end_); + uint64_t start = FileTimeToUint64(start_); + uint64_t end = FileTimeToUint64(end_); + float busy_time = (process_kernel_time_end - process_kernel_time_start) + + (process_user_time_end - process_user_time_start); + cpu_process_utilization = busy_time / (end - start); + LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl; +#elif defined(__linux__) + float busy_time = + (process_tms_end_.tms_utime - process_tms_start_.tms_utime) + + (process_tms_end_.tms_stime - process_tms_start_.tms_stime); + cpu_process_utilization = busy_time / (end_ - start_); +#else + LOG(WARNING) + << "Current System is not supported to get process cpu utilization" + << cpu_process_utilization << std::endl; +#endif + return cpu_process_utilization; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h new file mode 100644 index 0000000000000000000000000000000000000000..7b05a6302cdb0628ad526b3fc6ae18fcb8df619b --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
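CpuUtilization is a plain begin/end sampler: each Record*TimeInfo call snapshots /proc/stat and times() on Linux (GetSystemTimes/GetProcessTimes on Windows), and the Get* methods turn the two snapshots into busy ratios. A minimal usage sketch; RunWorkload is a hypothetical stand-in for the region being measured:

    paddle::platform::CpuUtilization sampler;
    sampler.RecordBeginTimeInfo();
    RunWorkload();
    sampler.RecordEndTimeInfo();
    float system_util = sampler.GetCpuUtilization();             // all processes
    float process_util = sampler.GetCpuCurProcessUtilization();  // this process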
+ +#pragma once + +#include +#include +#include +#include "glog/logging.h" +#ifdef _MSC_VER +#include +#else +#include +#include +#endif + +namespace paddle { +namespace platform { + +class CpuUtilization { + public: + CpuUtilization() {} + void RecordBeginTimeInfo(); + void RecordEndTimeInfo(); + float GetCpuUtilization(); + float GetCpuCurProcessUtilization(); + + private: +#ifdef _MSC_VER + FILETIME start_, end_; + FILETIME process_user_time_start_, process_user_time_end_; + FILETIME process_kernel_time_start_, process_kernel_time_end_; + FILETIME system_user_time_start_, system_user_time_end_; + FILETIME system_kernel_time_start_, system_kernel_time_end_; + FILETIME system_idle_time_start_, system_idle_time_end_; + FILETIME process_creation_time_, process_exit_time_; +#else + clock_t start_, end_; + uint64_t idle_start_, idle_end_; + uint64_t iowait_start_, iowait_end_; + uint64_t nice_time_start_, nice_time_end_; + uint64_t irq_start_, irq_end_; + uint64_t softirq_start_, softirq_end_; + uint64_t steal_start_, steal_end_; + struct tms system_tms_start_, system_tms_end_; + struct tms process_tms_start_, process_tms_end_; +#endif +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index 4d3b807aba82ea91770dddfcf655ec2431cdb197..da12dccb74924fd27dee3047d29636341f7c47a2 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/profiler/cupti_data_process.h" #include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -26,7 +27,7 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, uint64_t start_ns, return; } DeviceTraceEvent event; - event.name = kernel->name; + event.name = demangle(kernel->name); event.type = TracerEventType::Kernel; event.start_ns = kernel->start; event.end_ns = kernel->end; diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h old mode 100755 new mode 100644 diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 54c5b219310a9c64214e721f2f6b310e20c5d733..fcaba9a43ca9385ab38e440f7b8659298a02ef05 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -21,26 +21,55 @@ limitations under the License. */ namespace paddle { namespace platform { +// Default tracing level. +// It is Recommended to set the level explicitly. static constexpr uint32_t kDefaultTraceLevel = 4; -// CPU event tracing. A trace marks something that happens but has no duration + +// Host event tracing. A trace marks something that happens but has no duration // associated with it. For example, thread starts working. // Chrome Trace Viewer Format: Instant Event struct RecordInstantEvent { + /** + * @param name: It is the caller's reponsibility to manage the underlying + * storage. RecordInstantEvent stores the pointer. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. + */ explicit RecordInstantEvent(const char* name, TracerEventType type, uint32_t level = kDefaultTraceLevel); }; -// CPU event tracing. 
A trace starts when an object of this class is created and +// stops when the object is destroyed. // Chrome Trace Viewer Format: Duration Event/Complete Event class RecordEvent { public: + /** + * @param name: If your string argument has a longer lifetime (e.g.: string + * literal, static variables, etc) than the event, use 'const char* name'. + * Do your best to avoid using 'std::string' as the argument type. It will + * cause deep-copy to harm performance. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will work if HostTraceLevel >= level. + */ explicit RecordEvent( const std::string& name, const TracerEventType type = TracerEventType::UserDefined, uint32_t level = kDefaultTraceLevel, const EventRole role = EventRole::kOrdinary); + /** + * @param name: It is the caller's responsibility to manage the underlying + * storage. RecordEvent stores the pointer. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will work if HostTraceLevel >= level. + */ explicit RecordEvent(const char* name, const TracerEventType type = TracerEventType::UserDefined, uint32_t level = kDefaultTraceLevel, diff --git a/paddle/fluid/platform/profiler/extra_info.h b/paddle/fluid/platform/profiler/extra_info.h new file mode 100644 index 0000000000000000000000000000000000000000..04532592ebd30793d7707e03b96c07c8e4dc4b1e --- /dev/null +++ b/paddle/fluid/platform/profiler/extra_info.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +class ExtraInfo { + public: + ExtraInfo() {} + template + void AddExtraInfo(const std::string& key, const std::string& format, + Args... args); + void Clear() { extra_info_.clear(); } + std::unordered_map GetExtraInfo() { + return extra_info_; + } + + private: + std::unordered_map extra_info_; +}; + +template +void ExtraInfo::AddExtraInfo(const std::string& key, const std::string& format, + Args...
args) { + std::string value = string_format(format, args...); + extra_info_[key] = value; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 3bcd68c55963082bfc0ce12bbcdc0b07a05bbe97..49f9362527591744dd0685375e0244673a7b3081 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -202,7 +202,7 @@ class ThreadEventRecorder { ThreadEventSection GatherEvents() { ThreadEventSection thr_sec; - thr_sec.thread_name = thread_name_; + thr_sec.thread_name = GetCurrentThreadName(); thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); return thr_sec; @@ -210,7 +210,6 @@ class ThreadEventRecorder { private: uint64_t thread_id_; - std::string thread_name_; EventContainer base_evt_cntr_; }; diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index 2172fe4d1e3d5786492ea8741b5e50146648e59d..afd1c2b3012d46100dbead81792108cffb52e9a3 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -14,9 +14,16 @@ #include "paddle/fluid/platform/profiler/host_tracer.h" #include "glog/logging.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" +// Used to filter events, works like glog VLOG(level). +// RecordEvent will works if host_trace_level >= level. +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, + "RecordEvent will works " + "if host_trace_level >= level."); + namespace paddle { namespace platform { @@ -26,6 +33,9 @@ void ProcessHostEvents(const HostEventSection& host_events, TraceEventCollector* collector) { for (const auto& thr_sec : host_events.thr_sections) { uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } for (const auto& evt : thr_sec.events) { HostTraceEvent event; event.name = evt.name; @@ -41,12 +51,18 @@ void ProcessHostEvents(const HostEventSection& host_events, } // namespace +void HostTracer::PrepareTracing() { + // warm up + HostTraceLevel::GetInstance().SetLevel(options_.trace_level); + state_ = TracerState::READY; +} + void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( state_ == TracerState::READY || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("TracerState must be READY")); HostEventRecorder::GetInstance().GatherEvents(); - HostTraceLevel::GetInstance().SetLevel(trace_level_); + HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } diff --git a/paddle/fluid/platform/profiler/host_tracer.h b/paddle/fluid/platform/profiler/host_tracer.h index b6c10e558b787cd84e760fb892bd75ebace90c3c..d05e829357f884b45cffae8cee7f7b627a366359 100644 --- a/paddle/fluid/platform/profiler/host_tracer.h +++ b/paddle/fluid/platform/profiler/host_tracer.h @@ -45,9 +45,9 @@ struct HostTracerOptions { class HostTracer : public TracerBase { public: - explicit HostTracer(const HostTracerOptions& options) { - trace_level_ = options.trace_level; - } + explicit HostTracer(const HostTracerOptions& options) : options_(options) {} + + void PrepareTracing() override; void StartTracing() override; @@ -56,7 +56,7 @@ class HostTracer : public TracerBase { void CollectTraceData(TraceEventCollector* collector) override; 
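With the host_trace_level flag added above, event filtering behaves like glog verbosity: a RecordEvent (or RecordInstantEvent) is kept only when host_trace_level >= the event's level, so an event left at kDefaultTraceLevel (4) is dropped under the flag's default of 2 unless the flag is raised. A short sketch using only the constructor shown above:

    {
      // Captured only when FLAGS_host_trace_level >= 4 (the default event level).
      paddle::platform::RecordEvent evt(
          "sample_region", paddle::platform::TracerEventType::UserDefined);
      // ... traced work; the duration event closes when evt is destroyed ...
    }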
private: - uint32_t trace_level_; + HostTracerOptions options_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h old mode 100755 new mode 100644 index ff4effad5ecc414e70b99b0cd996c5ea402c7e3a..05a68cf2a4a8debf482cd9226f1226d3679f62a1 --- a/paddle/fluid/platform/profiler/output_logger.h +++ b/paddle/fluid/platform/profiler/output_logger.h @@ -33,7 +33,6 @@ class BaseLogger { virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogNodeTrees(const NodeTrees&) {} - virtual void LogMetaInfo() {} }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index de5a0cc9be4ede29ac70409edaac5541c53c5c96..4fc1c6daf96c7f30cbd549b23b1a8f23563bc590 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -23,11 +23,13 @@ #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/tracer_base.h" +DECLARE_int64(host_trace_level); + namespace paddle { namespace platform { struct ProfilerOptions { - uint32_t trace_level = 0; + uint32_t trace_level = FLAGS_host_trace_level; }; class Profiler { diff --git a/paddle/fluid/platform/profiler/test_extra_info.cc b/paddle/fluid/platform/profiler/test_extra_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..7274c9de977e982fa0c00015cafe07ff1d0a9d54 --- /dev/null +++ b/paddle/fluid/platform/profiler/test_extra_info.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +using paddle::platform::ExtraInfo; + +TEST(ExtraInfoTest, case0) { + ExtraInfo instance; + instance.AddExtraInfo(std::string("info1"), std::string("%d"), 20); + instance.AddExtraInfo(std::string("info2"), std::string("%s"), "helloworld"); + std::unordered_map map = instance.GetExtraInfo(); + EXPECT_EQ(map["info1"], "20"); + EXPECT_EQ(map["info2"], "helloworld"); + EXPECT_EQ(map.size(), 2u); + instance.Clear(); + map = instance.GetExtraInfo(); + EXPECT_EQ(map.size(), 0u); +} diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index 30b32220d9f845e5c03e058b72224194bf769b76..cc85a178d14e57c1e1523e794f0016afb5714299 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -15,6 +15,8 @@ limitations under the License. 
*/ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { @@ -32,6 +34,10 @@ class TraceEventCollector { device_events_.push_back(event); } + void AddThreadName(uint64_t tid, const std::string& name) { + thread_names_[tid] = name; + } + const std::list& HostEvents() const { return host_events_; } const std::list& RuntimeEvents() const { @@ -42,7 +48,12 @@ class TraceEventCollector { return device_events_; } + const std::unordered_map& ThreadNames() const { + return thread_names_; + } + private: + std::unordered_map thread_names_; std::list host_events_; std::list runtime_events_; std::list device_events_; diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..b43389866c7a8150846bef874f49bd72907f446f --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler/utils.h" + +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cupti.h" + +namespace paddle { +namespace platform { +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, + int32_t StaticSharedMemory, + int32_t DynamicSharedMemory, int32_t BlockX, + int32_t BlockY, int32_t BlockZ, float BlocksPerSm) { + float occupancy = 0.0; + std::vector device_ids = GetSelectedDevices(); + if (DeviceId < device_ids.size()) { + const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId); + cudaOccFuncAttributes occFuncAttr; + occFuncAttr.maxThreadsPerBlock = INT_MAX; + occFuncAttr.numRegs = RegistersPerThread; + occFuncAttr.sharedSizeBytes = StaticSharedMemory; + occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; + occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + occFuncAttr.maxDynamicSharedSizeBytes = 0; + const cudaOccDeviceState occDeviceState = {}; + int blockSize = BlockX * BlockY * BlockZ; + size_t dynamicSmemSize = DynamicSharedMemory; + cudaOccResult occ_result; + cudaOccDeviceProp prop(device_property); + cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( + &occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize, + dynamicSmemSize); + if (status == CUDA_OCC_SUCCESS) { + if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) { + BlocksPerSm = occ_result.activeBlocksPerMultiprocessor; + } + occupancy = + BlocksPerSm * blockSize / + static_cast(device_property.maxThreadsPerMultiProcessor); + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } + return occupancy; +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 
04014b972c3e3599beef0a60635fa122a153233f..cd56d343842686abc31343effc93cf1a4887411c 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -42,5 +45,11 @@ static std::string GetStringFormatLocalTime() { static int64_t nsToUs(int64_t ns) { return ns / 1000; } +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, int32_t blockX, + int32_t blockY, int32_t blockZ, float blocksPerSm); +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 49690d1c66be74090c684d09f50e6c0d7b67d787..6f714a677033bb87d1a221f62baffa1112726571 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -59,7 +59,7 @@ struct Transform { BinaryOperation op); }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. +// NOTE: After the phi kernel is migrated, it needs to be deleted. template <> struct Transform { template diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 3453cff30f5ad2d1016dcd786733a7024ed0ae4a..1f06eda8a2ee5dc8322b5e16e1f7eb2e0703f9a8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -37,6 +37,10 @@ if (WITH_ASCEND_CL) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() +if (WITH_CNCL) + set(PYBIND_DEPS ${PYBIND_DEPS} cncl_context) +endif() + if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) @@ -73,8 +77,17 @@ set(PYBIND_SRCS compatible.cc io.cc generator_py.cc + communication.cc cuda_streams_py.cc) +if(NOT ON_INFER) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + if (WITH_NCCL) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + endif() + set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) +endif() + if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) @@ -133,6 +146,10 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS hccl_context) endif(WITH_ASCEND_CL) + if(WITH_CNCL) + list(APPEND OP_FUNCTION_GENERETOR_DEPS cncl_context) + endif(WITH_CNCL) + add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) add_executable(eager_op_function_generator eager_op_function_generator.cc) @@ -282,7 +299,7 @@ if(WITH_PYTHON) if(NOT ON_INFER) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api 
autograd_meta backward grad_node_info pten op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 7bb7f03983eb9e8c88f46174a40664f1110682d1..b29cc10e8f56f5698874db8b357621aa4a88b238 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -24,10 +24,41 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" +#include "pybind11/pybind11.h" namespace py = pybind11; +namespace pybind11 { +namespace detail { + +// Note: use same enum number of float16 in numpy. +// import numpy as np +// print np.dtype(np.float16).num # 23 +constexpr int NPY_FLOAT16_ = 23; + +// Note: Since float16 is not a builtin type in C++, we register +// paddle::platform::float16 as numpy.float16. +// Ref: https://github.com/pybind/pybind11/issues/1776 +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "e" represents float16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. + return "e"; + } + static constexpr auto name = _("float16"); +}; + +} // namespace detail +} // namespace pybind11 + namespace paddle { namespace pybind { @@ -175,6 +206,7 @@ void BindFleetExecutor(py::module* m) { .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) + .def(py::init(&DistModelDataBufCreate)) .def("reset", [](DistModelDataBuf& self, std::vector& data) { self.Resize(data.size() * sizeof(float)); @@ -183,29 +215,35 @@ void BindFleetExecutor(py::module* m) { .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) + .def("reset", &DistModelDataBufReset) .def("length", &DistModelDataBuf::length) - .def("tolist", - [](DistModelDataBuf& self, const std::string& dtype) -> py::list { - py::list l; - if (dtype == "int32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int32_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "int64") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int64_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(float); - l = py::cast(std::vector(data, data + size)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. 
Now only supports INT32, INT64 and " - "FLOAT32.")); - } - return l; - }); + .def("tolist", [](DistModelDataBuf& self, + const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float16") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(paddle::platform::float16); + l = py::cast( + std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. Now only supports INT32, INT64, " + "FLOAT16 and FLOAT32.")); + } + return l; + }); py::class_(*m, "DistModelTensor") .def(py::init<>()) @@ -221,6 +259,10 @@ void BindFleetExecutor(py::module* m) { py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) + .def(py::init(&DistModelTensorCreate), + py::arg("data"), py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) .def_readwrite("name", &DistModelTensor::name) .def_readwrite("shape", &DistModelTensor::shape) .def_readwrite("data", &DistModelTensor::data) @@ -231,7 +273,8 @@ void BindFleetExecutor(py::module* m) { py::enum_(*m, "DistModelDataType") .value("FLOAT32", DistModelDataType::FLOAT32) .value("INT64", DistModelDataType::INT64) - .value("INT32", DistModelDataType::INT32); + .value("INT32", DistModelDataType::INT32) + .value("FLOAT16", DistModelDataType::FLOAT16); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0d2777f825dc592e19230bc2ba4412f943d0c2b --- /dev/null +++ b/paddle/fluid/pybind/communication.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_store.h" +#include "paddle/fluid/pybind/communication.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using TCPStore = paddle::distributed::TCPStore; + +void BindTCPStore(py::module* m) { + py::class_(*m, "TCPStore") + .def( + py::init()) + .def("add", &TCPStore::add) + .def("get", &TCPStore::get); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/pybind/communication.h similarity index 63% rename from paddle/fluid/operators/size_op.cu rename to paddle/fluid/pybind/communication.h index de56ecd95270577689f699462b9273b43f34595e..17045ccfe65cae25471ceff3abf0129b2a21acb0 100644 --- a/paddle/fluid/operators/size_op.cu +++ b/paddle/fluid/pybind/communication.h @@ -12,11 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/size_op.h" - -REGISTER_OP_CUDA_KERNEL( - size, paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, paddle::operators::SizeKernel, - paddle::operators::SizeKernel); +#pragma once + +#include + +#include "pybind11/chrono.h" +#include "pybind11/complex.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindTCPStore(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..e057fb53ccecc7193fd52b8beda2c4f2880560e8 --- /dev/null +++ b/paddle/fluid/pybind/distributed_py.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/phi/api/all.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#endif + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using Tensor = paddle::experimental::Tensor; + +void BindDistributed(py::module *m) { + py::enum_(*m, "ReduceOp") + .value("SUM", distributed::ReduceOp::SUM) + .value("AVG", distributed::ReduceOp::AVG) + .value("MAX", distributed::ReduceOp::MAX) + .value("MIN", distributed::ReduceOp::MIN) + .value("PRODUCT", distributed::ReduceOp::PRODUCT); + + py::class_(*m, "AllreduceOptions") + .def(py::init<>()) + .def_readwrite("reduce_op", &distributed::AllreduceOptions::reduce_op); + + py::class_(*m, "BroadcastOptions") + .def(py::init<>()) + .def_readwrite("source_rank", &distributed::BroadcastOptions::source_rank) + .def_readwrite("source_root", + &distributed::BroadcastOptions::source_root); + + auto ProcessGroup = + py::class_>(*m, "ProcessGroup") + .def("rank", &distributed::ProcessGroup::GetRank) + .def("size", &distributed::ProcessGroup::GetSize) + .def("name", &distributed::ProcessGroup::GetBackendName) + .def("allreduce", + [](distributed::ProcessGroup &self, py::handle py_tensor, + distributed::ReduceOp op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::AllreduceOptions opts; + opts.reduce_op = op; + std::vector tensors = {tensor}; + return self.AllReduce(tensors, opts); + }, + py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def("broadcast", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int source_rank) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts; + opts.source_rank = source_rank; + std::vector tensors = {tensor}; + return self.Broadcast(tensors, opts); + }, + py::arg("tensor"), py::arg("source_rank"), + py::call_guard()); + +#if defined(PADDLE_WITH_NCCL) + py::class_>( + *m, "ProcessGroupNCCL", ProcessGroup) + .def(py::init(), + py::call_guard()); + + py::class_>(*m, "task") + .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted) + .def("wait", &distributed::ProcessGroup::Task::Wait, + py::arg("timeout") = kWaitTimeout, + py::call_guard()) + .def("synchronize", &distributed::ProcessGroup::Task::Synchronize, + py::call_guard()); +#endif + + // define parallel strategy, it will be removed + py::class_ pg_strategy( + *m, "ProcessGroupStrategy", ""); + pg_strategy.def(py::init()) + .def_property("nranks", + [](const distributed::ProcessGroupStrategy &self) { + return self.nranks_; + }, + [](distributed::ProcessGroupStrategy &self, int nranks) { + self.nranks_ = nranks; + }) + .def_property("local_rank", + [](const distributed::ProcessGroupStrategy &self) { + return self.local_rank_; + }, + [](distributed::ProcessGroupStrategy &self, + int local_rank) { self.local_rank_ = local_rank; }) + .def_property( + "trainer_endpoints", + [](const distributed::ProcessGroupStrategy &self) { + return self.trainer_endpoints_; + }, + [](distributed::ProcessGroupStrategy &self, + std::vector 
eps) { self.trainer_endpoints_ = eps; }) + .def_property("current_endpoint", + [](const distributed::ProcessGroupStrategy &self) { + return self.current_endpoint_; + }, + [](distributed::ProcessGroupStrategy &self, + const std::string &ep) { self.current_endpoint_ = ep; }) + .def_property("nrings", + [](const distributed::ProcessGroupStrategy &self) { + return self.nrings_; + }, + [](distributed::ProcessGroupStrategy &self, int nrings) { + self.nrings_ = nrings; + }); +} + +} // end namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.h b/paddle/fluid/pybind/distributed_py.h new file mode 100644 index 0000000000000000000000000000000000000000..be5c7549b8e8d8f5d3ad91e90ed43112a664d339 --- /dev/null +++ b/paddle/fluid/pybind/distributed_py.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/chrono.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindDistributed(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 241e9f9058dfe35fa36df12515e3ffc1a2f38a6b..1052f93d32ec3cb626577c4b584cc6172c83da2e 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -50,7 +50,6 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { if (obj) { auto v = reinterpret_cast(obj); new (&(v->tensor)) paddle::experimental::Tensor(); - Py_INCREF(obj); } return obj; } @@ -58,43 +57,37 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { // TODO(jiabin): Overload this once we need more constructor in Python void EmptyTensorInitializer(TensorObject* self, const std::string& name, const paddle::platform::Place& place, - bool persistable = false, bool stop_gradient = true, + bool persistable = false, int stop_gradient = -1, framework::proto::VarType::Type dtype = paddle::framework::proto::VarType::FP32, const std::vector& dims = {}, framework::proto::VarType::Type var_type = paddle::framework::proto::VarType::LOD_TENSOR) { auto ddims = phi::make_ddim(dims); - PADDLE_ENFORCE_GE( - phi::product(ddims), 0, - paddle::platform::errors::InvalidArgument( - "Create Eager Tensor with dims contain minus num is ilegal" - "Please check your code and make sure you new a " - "eager tensor with fixed shape instead of using -1.")); self->tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); - autograd_meta->SetStopGradient(stop_gradient); + if (stop_gradient != -1) { + autograd_meta->SetStopGradient(static_cast(stop_gradient)); + } if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { // TODO(jiabin): Maybe support LOD later std::shared_ptr dense_tensor = std::make_shared( phi::make_intrusive(place), - 
phi::DenseTensorMeta(paddle::framework::TransToPtenDataType(dtype), + phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype), ddims)); - dense_tensor->mutable_data(place); + if (phi::product(ddims) > 0) { + dense_tensor->mutable_data(place); + } self->tensor.set_impl(dense_tensor); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "We only support LoDTensor to be constructed by this initializer, " - "please check your var type first and make sure you are going to " - "construct LoDTensor.")); } if (!autograd_meta->GetMutableGradNode()) { VLOG(3) << "Tensor(" << name << ") have not GradNode, add GradNodeAccumulation for it."; - autograd_meta->SetGradNode(std::make_shared()); + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); } } @@ -140,16 +133,15 @@ void InitTensorWithTensor(TensorObject* self, VLOG(4) << "Same place, do ShareDataWith"; } else { self->tensor.set_impl( - src.copy_to(phi::TransToPtenBackend(place), true).impl()); + src.copy_to(phi::TransToPhiBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } - egr::EagerUtils::autograd_meta(&(self->tensor))->SetStopGradient(true); if (src.get_autograd_meta()) { - egr::EagerUtils::unsafe_autograd_meta(self->tensor) + egr::EagerUtils::autograd_meta(&(self->tensor)) ->SetPersistable( egr::EagerUtils::unsafe_autograd_meta(src)->Persistable()); } else { - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->SetPersistable(false); + egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } } @@ -165,11 +157,10 @@ void InitTensorWithFrameworkTensor(TensorObject* self, auto temp = paddle::experimental::Tensor(std::make_shared(src)); self->tensor.set_impl( - temp.copy_to(phi::TransToPtenBackend(place), true).impl()); + temp.copy_to(phi::TransToPhiBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } - egr::EagerUtils::autograd_meta(&(self->tensor))->SetStopGradient(true); - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->SetPersistable(false); + egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } py::object ParsePyArray( @@ -218,21 +209,18 @@ paddle::platform::Place ParsePlace( } // boolean arguments: zero_copy, stop_gradient, persistable -bool ParseBooleanArgs(std::string key, - std::unordered_map kws_map, - std::unordered_map kw_order_map, - PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { - bool res = false; - if (key == "stop_gradient") res = true; +int ParseBooleanArgs(std::string key, + std::unordered_map kws_map, + std::unordered_map kw_order_map, + PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { + int res = -1; if (kw_order_map[key] <= args_num) { - res = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, kw_order_map[key] - 1), - kw_order_map[key] - 1); + res = static_cast(CastPyArg2AttrBoolean( + PyTuple_GET_ITEM(args, kw_order_map[key] - 1), kw_order_map[key] - 1)); } else { if (flag_kwargs && kws_map[key] != NULL) { - res = CastPyArg2AttrBoolean(kws_map[key], 0); - } else { - return res; + res = static_cast(CastPyArg2AttrBoolean(kws_map[key], 0)); } } return res; @@ -288,15 +276,15 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr, bool persistable = false; bool zero_copy = false; std::string act_name = ""; - bool stop_gradient = true; + int stop_gradient = -1; numpy_value = ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num); place = ParsePlace(kws_map, kw_order_map, args, flag_kwargs, args_num); - persistable = ParseBooleanArgs("persistable", kws_map, kw_order_map, args, - 
flag_kwargs, args_num); - zero_copy = ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, - flag_kwargs, args_num); + persistable = (1 == ParseBooleanArgs("persistable", kws_map, kw_order_map, + args, flag_kwargs, args_num)); + zero_copy = (1 == ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, + flag_kwargs, args_num)); act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num); stop_gradient = ParseBooleanArgs("stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num); @@ -571,7 +559,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EmptyTensorInitializer(py_tensor_ptr, act_name, egr::Controller::Instance().GetExpectedPlace(), persistable, - /* stop_gradient */ true, dtype, dims, var_type); + /* stop_gradient */ -1, dtype, dims, var_type); return 0; } else { @@ -655,7 +643,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); EmptyTensorInitializer(py_tensor_ptr, act_name, egr::Controller::Instance().GetExpectedPlace(), - persistable, true, dtype, dims, var_type); + persistable, -1, dtype, dims, var_type); return 0; } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's initializer."; @@ -726,9 +714,8 @@ PyMappingMethods mapping_methods; void BindEager(pybind11::module* module) { auto m = module->def_submodule("eager"); - auto& internals = pybind11::detail::get_internals(); auto heap_type = reinterpret_cast( - internals.default_metaclass->tp_alloc(internals.default_metaclass, 0)); + PyType_Type.tp_alloc(&PyType_Type, 0)); heap_type->ht_name = ToPyObject("Tensor"); heap_type->ht_qualname = ToPyObject("Tensor"); auto type = &heap_type->ht_type; @@ -742,8 +729,8 @@ void BindEager(pybind11::module* module) { type->tp_getset = variable_properties; type->tp_init = TensorInit; type->tp_new = TensorNew; - Py_INCREF(internals.instance_base); - type->tp_base = reinterpret_cast(internals.instance_base); + Py_INCREF(&PyBaseObject_Type); + type->tp_base = reinterpret_cast(&PyBaseObject_Type); type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; #if PY_VERSION_HEX >= 0x03050000 diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index b825e9265a8cd8b080df7fd316b33007c2445384..0b04dc7347ce78f87d6f8d81e30eb4135fd965ed 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -135,7 +135,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - dst = src.copy_to(phi::TransToPtenBackend(place), blocking); + dst = src.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( egr::EagerUtils::autograd_meta(&(src))->StopGradient()); egr::EagerUtils::autograd_meta(&dst)->SetPersistable( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index b1aef8fc08fea818045a97e29e5b4f2d0e30e222..f11a2ab2517fb481f184c9b68b2558c999d88ec9 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -20,6 +20,8 @@ limitations under the License. 
*/ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -35,6 +37,82 @@ limitations under the License. */ namespace paddle { namespace pybind { +namespace py = ::pybind11; + +class PyTensorHook : public egr::TensorHook { + public: + explicit PyTensorHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorHook for var " << var.name(); + + PyObject* res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, ToPyObject(var), nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) { + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "Hook function of Tensor return a nullptr.")); + if (res == Py_None) { + return var; + } + return reinterpret_cast(res)->tensor; + } + + private: + PyObject* py_func_; +}; + +class PyTensorVoidHook : public egr::TensorVoidHook { + public: + explicit PyTensorVoidHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorVoidHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + void operator()() override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorVoidHook"; + + try { + PyObject_CallFunctionObjArgs(py_func_, nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) 
{ + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + } + + private: + PyObject* py_func_; +}; + extern void InitTensorWithNumpyValue(TensorObject* self, const pybind11::object& array, bool zero_copy); @@ -113,7 +191,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = - self->tensor.copy_to(phi::TransToPtenBackend(place), blocking); + self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( @@ -177,7 +255,7 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, if (!meta->GetMutableGradNode()) { VLOG(6) << "Make grad node of tensor: " << self->tensor.name() << "become accumulation node"; - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); } egr::egr_utils_api::RetainGradForTensor(self->tensor); } @@ -186,36 +264,51 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* tensor__clear_gradient(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "ClearGradient " << self->tensor.name(); + Py_ssize_t args_num = PyTuple_Size(args); + bool set_to_zero = true; + if (args_num == (Py_ssize_t)1) { + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); + } + paddle::experimental::Tensor* grad; if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - grad = accumulation_grad_node->Grad(); + grad = egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); grad = meta->MutableGrad(); } - if (grad->initialized()) { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if (set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } + } } + Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL @@ -228,19 +321,15 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, if 
(egr::egr_utils_api::IsLeafTensor(self->tensor)) { // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - if (accumulation_grad_node->Grad()->initialized()) { - accumulation_grad_node->Grad()->set_impl( - paddle::experimental::zeros_like(*(accumulation_grad_node->Grad())) - .impl()); + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + if (grad->initialized()) { + grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl()); } } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); @@ -392,6 +481,92 @@ static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + int64_t hook_id; + if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + hook_id = accumulation_grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + + } else { + VLOG(6) << "Register hook for non leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + hook_id = grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + } + return ToPyObject(hook_id); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_remove_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(6) << "Remove the registered hook for tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + + int64_t hook_id = pybind::CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0); + + return ToPyObject(grad_node->RemoveGradientHook(hook_id)); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "Register reduce hook for tensor: " << self->tensor.name(); + + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE_EQ(egr::egr_utils_api::IsLeafTensor(self->tensor), true, + platform::errors::InvalidArgument( + "Only can register backward hook for leaf Tensor.")); + PADDLE_ENFORCE_EQ( + !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(), + true, 
platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient.")); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(hook_func)); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -407,7 +582,7 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"retain_grads", (PyCFunction)(void (*)(void))tensor_retain_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_clear_gradient", (PyCFunction)(void (*)(void))tensor__clear_gradient, + {"clear_gradient", (PyCFunction)(void (*)(void))tensor_clear_gradient, METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -429,6 +604,14 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_grad_hook", + (PyCFunction)(void (*)(void))tensor_register_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_remove_grad_hook", (PyCFunction)(void (*)(void))tensor_remove_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_backward_hook", + (PyCFunction)(void (*)(void))tensor_register_reduce_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 4fe47d5a8427d11f560e73990ea8bad7bae7a929..c15c171799f4421fc3e8b40a84abdbb062709dc7 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -32,7 +32,7 @@ #endif #include "paddle/fluid/pybind/op_function_generator.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" // clang-format off @@ -365,9 +365,9 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip ooerator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. 
- // if the pten lib contains op kernel, we still generate ops method + // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } std::string func_name = "eager_api_" + op_type; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 43cfb50f2afe11a131e4bd71862b9efa84c841a9..2e1390cb96155c4832a8ceace889e331039ed43f 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -70,26 +70,13 @@ PyObject* tensor_properties_get_stop_gradient(TensorObject* self, PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY - if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - return ToPyObject(*accumulation_grad_node->Grad()); + VLOG(6) << "Get grad for tensor: " << self->tensor.name(); + auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); + if (meta) { + return ToPyObject(meta->Grad()); } else { - VLOG(6) << "Get grad for tensor: " << self->tensor.name(); - auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { - return ToPyObject(meta->Grad()); - } else { - Py_INCREF(Py_None); - return Py_None; - } + Py_INCREF(Py_None); + return Py_None; } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -101,16 +88,15 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, PADDLE_ENFORCE( egr::egr_utils_api::IsLeafTensor(self->tensor), paddle::platform::errors::Fatal("Only leaf Tensor can be set grad.")); - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->Grad()->copy_(src, true); + + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + grad->copy_(src, true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9c033376d6c439f8a89a13fecf7bb968706504ef..c1e8822eec22179266d69d3b97890aebe678b187 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -555,6 +555,32 @@ PyObject* ToPyObject( return dict; } +// For Final State Dygraph, +// We directly use paddle::optional(Tensor) as dispensable Tensor +paddle::optional GetOptionalTensorFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check(obj)) { + obj = PyTuple_GET_ITEM(obj, 0); + } + + if (obj == nullptr || obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' 
(position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + return {}; + } + + return paddle::make_optional( + reinterpret_cast(obj)->tensor); +} + +// For Intermediate State Dygraph, +// we use an uninitialized Tensor to represent dispensable Tensor paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index fb19e108aeb7035a52b708358539fa198643f9db..0c721d6124791edda7f41d46dcbbbfcccc80fb95 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -89,10 +89,15 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +paddle::optional GetOptionalTensorFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); + std::vector GetTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); @@ -102,6 +107,7 @@ paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, PyObject* args, ssize_t arg_idx, bool dispensable = false); + std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 73c8f362d145db078ac4c84c91372dcdd61c47af..3145a9cf7655c053c269990e00982226eae49c7a 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -103,11 +103,13 @@ void BindCommunicatorContext(py::module* m) { py::init&, const std::vector&, const std::vector&, const std::vector&, int, bool, bool, bool, int, - bool>()) + bool, bool, int64_t>()) .def("var_name", [](const CommContext& self) { return self.var_name; }) .def("trainer_id", [](const CommContext& self) { return self.trainer_id; }) .def("table_id", [](const CommContext& self) { return self.table_id; }) + .def("program_id", + [](const CommContext& self) { return self.program_id; }) .def("split_varnames", [](const CommContext& self) { return self.splited_varnames; }) .def("split_endpoints", @@ -122,6 +124,8 @@ void BindCommunicatorContext(py::module* m) { [](const CommContext& self) { return self.origin_varnames; }) .def("is_tensor_table", [](const CommContext& self) { return self.is_tensor_table; }) + .def("is_datanorm_table", + [](const CommContext& self) { return self.is_datanorm_table; }) .def("__str__", [](const CommContext& self) { return self.print(); }); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 387addda9edd1fd011281545f423527cac6d8bd6..8c5ed2d11830195a6fb70c54d12c9ef3eb3fc8b2 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/bkcl_context.h" +#include "paddle/fluid/imperative/cncl_context.h" #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/hccl_context.h" @@ -2559,6 +2560,18 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif +#if defined(PADDLE_WITH_CNCL) + py::class_>( + m, "CNCLParallelContext") + .def(py::init()) + .def("init", [](imperative::CNCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::CNCLParallelContext::InitWithRingID, + py::arg("ring_id")); +#endif + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) py::class_ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index cbbe56985b2adaab0a4a33214132066332cdcd79..9d5bcfac494cba0c550cf7f2751f485b689473b9 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif -// pten +// phi #include "paddle/phi/kernels/declarations.h" // NOTE(pangyoki): Inplace OP with duplicable input. @@ -400,9 +400,9 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip operator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. - // if the pten lib contains op kernel, we still generate ops method + // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 629dc2c4037e73b3dfd76126e14bb34c985e38ce..e8c338b3fd18801356e2f5474dc80e0150c40dce 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -44,8 +44,6 @@ void BindPSGPUWrapper(py::module* m) { .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) - .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer, - py::call_guard()) .def("set_date", &framework::PSGPUWrapper::SetDate, py::call_guard()) .def("set_dataset", &framework::PSGPUWrapper::SetDataset, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f653070b2eff7765aa4359a8405e1f27c6addf0b..6e553ad2e60e292881fa8bb0294ea2a247656b67 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -50,8 +50,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" @@ -78,6 +78,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/lod_utils.h" #ifndef PADDLE_ON_INFERENCE @@ -91,6 +92,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/bind_cost_model.h" #include "paddle/fluid/pybind/bind_fleet_executor.h" #include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" #include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/data_set_py.h" @@ -462,7 +464,7 @@ static void inline CreateVariableIfNotExit( tensor_temp->Resize(phi::make_ddim(var_desc.GetShape())); tensor_temp->mutable_data( exe->GetPlace(), - framework::TransToPtenDataType(var_desc.GetDataType())); + framework::TransToPhiDataType(var_desc.GetDataType())); } } } else { @@ -669,60 +671,60 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_get_use_default_grad_op_desc_maker_ops", [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - m.def( - "_get_all_register_op_kernels", - [](const std::string &lib) { - std::unordered_map> - all_kernels_info; - if (lib == "fluid" || lib == "all") { - auto &all_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - - for (auto &kernel_pair : all_kernels) { - auto op_type = kernel_pair.first; - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - paddle::framework::OpKernelType kernel_type = info_pair.first; - kernel_types.emplace_back( - paddle::framework::KernelTypeToString(kernel_type)); + m.def("_get_all_register_op_kernels", + [](const std::string &lib) { + std::unordered_map> + all_kernels_info; + if (lib == "fluid" || lib == "all") { + auto &all_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto &kernel_pair : all_kernels) { + auto op_type = kernel_pair.first; + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + paddle::framework::OpKernelType kernel_type = info_pair.first; + kernel_types.emplace_back( + paddle::framework::KernelTypeToString(kernel_type)); + } + all_kernels_info.emplace(op_type, kernel_types); } - all_kernels_info.emplace(op_type, kernel_types); } - } - if (lib == "pten" || lib == "all") { - auto pten_kernels = phi::KernelFactory::Instance().kernels(); - for (auto &kernel_pair : pten_kernels) { - auto op_type = phi::TransToFluidOpName(kernel_pair.first); - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - framework::OpKernelType kernel_type = - framework::TransPtenKernelKeyToOpKernelType(info_pair.first); - auto kernel_type_str = framework::KernelTypeToString(kernel_type); - if (all_kernels_info.count(op_type)) { - if (std::find(all_kernels_info[op_type].begin(), - all_kernels_info[op_type].end(), - kernel_type_str) == - all_kernels_info[op_type].end()) { - all_kernels_info[op_type].emplace_back(kernel_type_str); + if (lib == "phi" 
|| lib == "all") { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto &kernel_pair : phi_kernels) { + auto op_type = phi::TransToFluidOpName(kernel_pair.first); + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + framework::OpKernelType kernel_type = + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); + auto kernel_type_str = + framework::KernelTypeToString(kernel_type); + if (all_kernels_info.count(op_type)) { + if (std::find(all_kernels_info[op_type].begin(), + all_kernels_info[op_type].end(), + kernel_type_str) == + all_kernels_info[op_type].end()) { + all_kernels_info[op_type].emplace_back(kernel_type_str); + } + } else { + kernel_types.emplace_back(kernel_type_str); } - } else { - kernel_types.emplace_back(kernel_type_str); } - } - if (!kernel_types.empty()) { - all_kernels_info.emplace(op_type, kernel_types); + if (!kernel_types.empty()) { + all_kernels_info.emplace(op_type, kernel_types); + } } } - } - return all_kernels_info; - }, - py::arg("lib") = "all", - R"DOC( + return all_kernels_info; + }, + py::arg("lib") = "all", + R"DOC( Return the registered kernels in paddle. Args: - lib[string]: the libarary, could be 'pten', 'fluid' and 'all'. + lib[string]: the libarary, could be 'phi', 'fluid' and 'all'. )DOC"); // NOTE(zjl): ctest would load environment variables at the beginning even @@ -821,39 +823,39 @@ PYBIND11_MODULE(core_noavx, m) { .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::MLUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_clear", &framework::Tensor::clear) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::NPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) @@ -2621,6 +2623,7 @@ All parameter, 
weight, gradient are variables in Paddle. BindGlobalValueGetterSetter(&m); BindProcessMeshDesc(&m); BindFleetExecutor(&m); + BindTCPStore(&m); py::class_(m, "LodRankTable") .def("items", [](framework::LoDRankTable &table) { @@ -3783,86 +3786,142 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_IPU py::class_>(m, "IpuBackend") - .def(py::init(&platform::ipu::IpuBackend::GetNewInstance)) - .def("clear", &platform::ipu::IpuBackend::Clear) + std::unique_ptr>( + m, "IpuBackend") + // manage IpuBackend in C++ + .def("get_instance", + []() { + return std::unique_ptr( + platform::ipu::IpuBackend::GetInstance()); + }, + py::return_value_policy::reference) + .def("detach", &platform::ipu::IpuBackend::Detach) + .def("reset", &platform::ipu::IpuBackend::Reset) .def("set_scope", &platform::ipu::IpuBackend::SetScope) - .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy); - - py::class_ ipu_strategy(m, "IpuStrategy"); - ipu_strategy.def(py::init()) - .def_property( - "num_ipus", - [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; }, - [](platform::ipu::IpuStrategy &self, int num_ipus) { - self.num_ipus = num_ipus; - }) - .def_property( - "accumulationFactor", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.accumulationFactor; - }, - [](platform::ipu::IpuStrategy &self, int accumulationFactor) { - self.popart_options_.accumulationFactor = accumulationFactor; - }) - .def_property("batches_per_step", - [](const platform::ipu::IpuStrategy &self) { - return self.batches_per_step; - }, - [](platform::ipu::IpuStrategy &self, int batches_per_step) { - self.batches_per_step = batches_per_step; - }) - .def_property("is_training", - [](const platform::ipu::IpuStrategy &self) { - return self.is_training; - }, - [](platform::ipu::IpuStrategy &self, bool is_training) { - self.is_training = is_training; - }) - .def_property( - "enable_pipelining", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.enablePipelining; - }, - [](platform::ipu::IpuStrategy &self, bool enable_pipelining) { - self.popart_options_.enablePipelining = enable_pipelining; - }) - .def_property( - "enable_manual_shard", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.virtualGraphMode == - platform::ipu::VirtualGraphMode::Manual; - }, - [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) { - if (enable_ipu_shard) { - self.popart_options_.virtualGraphMode = - platform::ipu::VirtualGraphMode::Manual; - } else { - self.popart_options_.virtualGraphMode = - platform::ipu::VirtualGraphMode::Off; - } - }) - .def_property("need_avg_shard", - [](const platform::ipu::IpuStrategy &self) { - return self.need_avg_shard; - }, - [](platform::ipu::IpuStrategy &self, bool need_avg_shard) { - self.need_avg_shard = need_avg_shard; - }) - .def_property("batch_size", - [](const platform::ipu::IpuStrategy &self) { - return self.batch_size; - }, - [](platform::ipu::IpuStrategy &self, int batch_size) { - self.batch_size = batch_size; - }) - .def_property("enable_fp16", - [](const platform::ipu::IpuStrategy &self) { - return self.enable_fp16; - }, - [](platform::ipu::IpuStrategy &self, bool enable_fp16) { - self.enable_fp16 = enable_fp16; - }); + .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy) + .def("save_model_proto", &platform::ipu::IpuBackend::SaveModelProto); + + py::class_(m, "IpuStrategy") + .def(py::init()) + .def("set_options", + [](platform::ipu::IpuStrategy &self, const py::dict &opt) { + 
for (auto element : opt) { + auto option_name = element.first.cast(); + VLOG(10) << "Set option: " << option_name; + if (py::isinstance(element.second)) { + self.AddBoolOption(option_name, element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddDoubleOption(option_name, + element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddUint64Option(option_name, + element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddStringOption(option_name, + element.second.cast()); + } else if (py::isinstance(element.second) || + py::isinstance(element.second)) { + for (auto option : element.second.cast()) { + std::string option_val; + if (py::isinstance(option)) { + option_val = option.cast(); + } else if (py::isinstance(option)) { + option_val = std::to_string(option.cast()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Failed to convert type: %s when set IpuStrategy " + "option: %s", + option.get_type(), option_name)); + } + self.InsertStringOption(option_name, option_val); + } + } else if (py::isinstance(element.second)) { + if (option_name.rfind("location_", 0) == 0) { + for (auto option : element.second.cast()) { + self.SetTensorLocation( + option_name, option.first.cast(), + option.second.cast()); + } + } else if (option_name == "custom_op") { + std::string paddle_op; + std::string popart_op; + std::string domain; + int version = -1; + for (auto option : element.second.cast()) { + std::string option_key = option.first.cast(); + if (option_key == "paddle_op") { + paddle_op = option.second.cast(); + } else if (option_key == "popart_op") { + popart_op = option.second.cast(); + } else if (option_key == "domain") { + domain = option.second.cast(); + } else if (option_key == "version") { + version = option.second.cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid argument, key must be one of paddle_op, " + "popart_op, domain or version, but received %s", + option_key)); + } + } + self.AddCustomOp(paddle_op, popart_op, domain, version); + } else { + for (auto option : element.second.cast()) { + std::string option_key = option.first.cast(); + std::string option_val; + if (py::isinstance(option.second)) { + option_val = option.second.cast(); + } else if (py::isinstance(option.second)) { + option_val = + std::to_string(option.second.cast()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Failed to convert value type: %s when set " + "IpuStrategy option: %s", + option.second.get_type(), option_key)); + } + self.InsertStringPairOption(option_name, option_key, + option_val); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid IpuStrategy option value type: %s, please check " + "input value for option: %s", + element.second.get_type(), option_name)); + } + } + }) + .def("get_option", + [](platform::ipu::IpuStrategy &self, const std::string &name) { + py::dict res; + auto option_type = self.GetOptionType(name); + res["name"] = name; + res["type"] = option_type; + if (option_type == "vector") { + auto value = self.GetVectorOption(name); + res["value"] = value; + } else if (option_type == "map") { + auto value = self.GetMapOption(name); + res["value"] = value; + } else { + auto value_s = self.GetOption(name); + res["value_s"] = value_s; + if (option_type == "bool") { + res["value"] = static_cast(std::stoi(value_s)); + } else if (option_type == "uint64") { + res["value"] = std::stoul(value_s); + } else if (option_type == "double") { + res["value"] =
std::stod(value_s); + } else if (option_type == "string") { + res["value"] = value_s; + } + } + return res; + }) + .def("enable_pattern", &platform::ipu::IpuStrategy::EnablePattern) + .def("disable_pattern", &platform::ipu::IpuStrategy::DisablePattern) + .def("is_pattern_enabled", &platform::ipu::IpuStrategy::IsPatternEnabled); #endif BindFleetWrapper(&m); @@ -3893,6 +3952,9 @@ All parameter, weight, gradient are variables in Paddle. BindCompatible(&m); BindDataset(&m); BindGenerator(&m); +#ifndef PADDLE_ON_INFERENCE + BindDistributed(&m); +#endif #ifdef PADDLE_WITH_ASCEND BindAscendWrapper(&m); BindAscendGraph(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 531cc03f26714a6041c1e2c205640b9ea06c440c..e7abd64ec4439611c307440597c7278cabb03ab9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -324,7 +324,7 @@ void SetTensorFromPyArrayT( if (zero_copy) { auto holder = std::make_shared>(array); auto type = framework::ToDataType(std::type_index(typeid(T))); - self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); + self->ResetHolderWithType(holder, framework::TransToPhiDataType(type)); } else { auto dst = self->mutable_data(place); std::memcpy(dst, array.data(), array.nbytes()); @@ -348,10 +348,16 @@ void SetTensorFromPyArrayT( if (zero_copy) { auto holder = std::make_shared>(array); auto type = framework::ToDataType(std::type_index(typeid(T))); - self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); + self->ResetHolderWithType(holder, framework::TransToPhiDataType(type)); } else { - auto dst = self->mutable_data(place); - std::memcpy(dst, array.data(), array.nbytes()); + // IPU does not store Tensor data, Tensor will be created on CPU + if (!self->initialized()) { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } else { + auto dst = self->mutable_data(self->place()); + std::memcpy(dst, array.data(), array.nbytes()); + } } #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -512,7 +518,7 @@ void SetUVATensorFromPyArray( cuda_device_pointer, need_allocate_size, platform::CUDAPlace(device_id)); self_tensor->ResetHolderWithType(holder, - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); #endif } diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 2486c54d5addc40fae2c019ab6b0db4d6121a290..f2768f3dfa88d3405008baa7662f5e209ca3954c 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -2,13 +2,13 @@ if (NOT WITH_INFRT) return() endif() -option(INFRT_WITH_PTEN "Compile INFRT with PTEN" ON) +option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) -if (INFRT_WITH_PTEN) - add_definitions("-DINFRT_WITH_PTEN") +if (INFRT_WITH_PHI) + add_definitions("-DINFRT_WITH_PHI") endif() # compile flags @@ -74,6 +74,7 @@ endif() add_subdirectory(api) +add_subdirectory(backends) add_subdirectory(common) add_subdirectory(dialect) add_subdirectory(host_context) @@ -96,17 +97,19 @@ set(infrt_mlir_incs pd_extra_ops_inc rewrite_inc trt_ops_inc + pd_lower_to_trt_inc ) -if (INFRT_WITH_PTEN) - set(pten_libs pten) + +if (INFRT_WITH_PHI) + set(phi_libs phi) set(infrt_mlir_incs ${infrt_mlir_incs} - MLIRinfrt_pten_tensorIncGen - MLIRinfrt_pten_baseIncGen + MLIRinfrt_phi_tensorIncGen + MLIRinfrt_phi_baseIncGen ) endif() -cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} 
${pten_libs} paddle_framework_proto infrt_naive) -cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto) +cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index c2a4e0aff7a08e6b66fb2b2ce6f3165e1adcfd0a..28f63db49f4baec12bb43afa9034d5578d9f6cb1 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -42,7 +42,6 @@ using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT using namespace infrt::tensor; // NOLINT using infrt::dt::TensorMapType; // NOLINT -using infrt::dt::TensorType; // NOLINT namespace infrt { @@ -145,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { // process results auto& last_op = predict_func.front().back(); - if (last_op.getName().getStringRef() == "infrt.return") { + if (last_op.getName().getStringRef() == "Infrt.return") { for (size_t i = 0; i < last_op.getNumOperands(); ++i) { auto* value = AddValue(mlir::Value(last_op.getOperand(i))); results_.push_back(ValueRef(value)); diff --git a/paddle/infrt/backends/CMakeLists.txt b/paddle/infrt/backends/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b639f892925685bd61f05a0a5db0f8af0f44070a --- /dev/null +++ b/paddle/infrt/backends/CMakeLists.txt @@ -0,0 +1,3 @@ +if (INFRT_WITH_PHI AND WITH_GPU AND WITH_TENSORRT) + add_subdirectory(tensorrt) +endif() diff --git a/paddle/infrt/backends/host/pten_allocator.h b/paddle/infrt/backends/host/phi_allocator.h similarity index 95% rename from paddle/infrt/backends/host/pten_allocator.h rename to paddle/infrt/backends/host/phi_allocator.h index fa61e04fb670741c959c427d8d12c42fb1217251..c8f97e04a1b8376efbac749fffa70d77c7b95e72 100644 --- a/paddle/infrt/backends/host/pten_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -16,7 +16,7 @@ limitations under the License. */ namespace infrt { namespace backends { -class CpuPtenAllocator : public phi::Allocator { +class CpuPhiAllocator : public phi::Allocator { public: static void deleter(phi::Allocation* ptr) { ::operator delete(ptr); } diff --git a/paddle/infrt/backends/host/pten_context.h b/paddle/infrt/backends/host/phi_context.h similarity index 94% rename from paddle/infrt/backends/host/pten_context.h rename to paddle/infrt/backends/host/phi_context.h index 961c93529aeb44200f320d5804c561887257a4d6..9d0e3bc4fbb3158147283c1992cf1fee70c9b90d 100644 --- a/paddle/infrt/backends/host/pten_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace infrt { namespace backends { -class CpuPtenContext : public phi::CPUContext { +class CpuPhiContext : public phi::CPUContext { public: using Base = phi::CPUContext; using phi::CPUContext::SetEigenDevice; diff --git a/paddle/infrt/backends/tensorrt/CMakeLists.txt b/paddle/infrt/backends/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc20c9a2e14b639fbc02b74ab5870188d7e55d63 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(infrt_trt SRCS trt_engine.cc DEPS glog phi_dynload_cuda phi) + +cc_test_tiny(test_infrt_trt SRCS test_trt_engine.cc DEPS infrt_trt phi_dynload_cuda tensorrt_converter) diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..54b7bc3e8af835077fcd2ac00d33b15e4ae3f95c --- /dev/null +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +const char* model_input = "model_input"; +const char* model_output = "model_output1"; +const char* model_output2 = "model_output2"; + +TrtUniquePtr ConstructNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + IActivationLayer* act = + network->addActivation(*data, ActivationType::kSIGMOID); + CHECK_NOTNULL(act); + auto* act_out = act->getOutput(0); + std::vector output_length{1, 2}; + int axis; + nvinfer1::IPluginV2Layer* split_layer; + if (is_static_shape) { + axis = 0; + paddle::inference::tensorrt::plugin::SplitPlugin plugin( + axis, output_length, false); + split_layer = network->addPluginV2(&act_out, 1, plugin); + } else { + axis = 1; + 
paddle::inference::tensorrt::plugin::SplitPluginDynamic plugin( + axis, output_length, false); + split_layer = network->addPluginV2(&act_out, 1, plugin); + } + + split_layer->getOutput(0)->setName(model_output); + split_layer->getOutput(1)->setName(model_output2); + network->markOutput(*split_layer->getOutput(0)); + network->markOutput(*split_layer->getOutput(1)); + return network; +} + +// sigmoid(x) = 1 / (1 + exp(-x)) +inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } + +TEST(trt, run_static) { + TRTEngine static_trt_engine(0); + auto net = ConstructNetwork( + static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions static_build_options; + static_build_options.max_batch = 4; + static_trt_engine.Build(std::move(net), static_build_options); + InferenceOptions inference_options; + inference_options.batch = 2; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + phi::DenseTensor output, output2; + std::unordered_map outputs; + outputs.emplace(std::make_pair(model_output, &output)); + outputs.emplace(std::make_pair(model_output2, &output2)); + + static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.GetEngineInfo(); + static_trt_engine.Run(context); + + std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); + std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); + paddle::memory::Copy(phi::CPUPlace(), + output_data1.data(), + place, + output.data(), + sizeof(float) * output_data1.size(), + context.stream()); + paddle::memory::Copy(phi::CPUPlace(), + output_data2.data(), + place, + output2.data(), + sizeof(float) * output_data2.size(), + context.stream()); + cudaStreamSynchronize(context.stream()); + + for (size_t i = 0; i < host_data.size(); ++i) { + int w = i % 28; + int h = (i / 28) % 28; + int c = i / (28 * 28) % 3; + int n = i / (28 * 28 * 3); + if (c == 0) { + CHECK_NEAR( + sigmoid(host_data[i]), output_data1[n * 28 * 28 + h * 28 + w], 1e-5); + } else { + CHECK_NEAR(sigmoid(host_data[i]), + output_data2[n * 28 * 28 * 2 + (c - 1) * 28 * 28 + h * 28 + w], + 1e-5); + } + } +} + +TEST(trt, run_dynamic) { + TRTEngine engine(0); + auto net = ConstructNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 32; + // build_options.fp16 = true; + std::vector min_shape{1, 3, 16, 16}; + std::vector opt_shape{2, 3, 28, 28}; + std::vector max_shape{4, 3, 28, 28}; + build_options.shapes[model_input][0] = min_shape; + build_options.shapes[model_input][1] = opt_shape; + build_options.shapes[model_input][2] = max_shape; + engine.Build(std::move(net), build_options); + + InferenceOptions 
inference_options; + inference_options.batch = 2; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 16, 16})); + phi::DenseTensor input, output, output2; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 16 * 16, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + std::unordered_map outputs; + inputs.emplace(std::make_pair(model_input, &input)); + outputs.emplace(std::make_pair(model_output, &output)); + outputs.emplace(std::make_pair(model_output2, &output2)); + + engine.SetUpInference(inference_options, inputs, &outputs); + engine.GetEngineInfo(); + engine.Run(context); + + std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); + std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); + paddle::memory::Copy(phi::CPUPlace(), + output_data1.data(), + place, + output.data(), + sizeof(float) * output_data1.size(), + context.stream()); + paddle::memory::Copy(phi::CPUPlace(), + output_data2.data(), + place, + output2.data(), + sizeof(float) * output_data2.size(), + context.stream()); + cudaStreamSynchronize(context.stream()); + + for (size_t i = 0; i < host_data.size(); ++i) { + int w = i % 16; + int h = (i / 16) % 16; + int c = i / (16 * 16) % 3; + int n = i / (16 * 16 * 3); + if (c == 0) { + CHECK_NEAR( + sigmoid(host_data[i]), output_data1[n * 16 * 16 + h * 16 + w], 1e-5); + } else { + CHECK_NEAR(sigmoid(host_data[i]), + output_data2[n * 16 * 16 * 2 + (c - 1) * 16 * 16 + h * 16 + w], + 1e-5); + } + } +} + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..a204fe42b45080b0ba5526473622f34e4fe4ef41 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" + +#include +#include +#include "glog/logging.h" +#include "paddle/phi/backends/dynload/tensorrt.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/ddim.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +// The following two API are implemented in TensorRT's header file, cannot load +// from the dynamic library. 
So create our own implementation and directly +// trigger the method from the dynamic library. +static nvinfer1::IBuilder* createInferBuilder( + nvinfer1::ILogger& logger) { // NOLINT + return static_cast( + phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} +static nvinfer1::IRuntime* createInferRuntime( + nvinfer1::ILogger& logger) { // NOLINT + return static_cast( + phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} + +TRTEngine::TRTEngine(int device_id) : device_id_(device_id) { + FreshDeviceId(); + logger_.reset(new TrtLogger()); + builder_.reset(createInferBuilder(logger_->GetTrtLogger())); + phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); +} + +nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() { + CHECK_NOTNULL(builder_); + return builder_.get(); +} + +void TRTEngine::Build(TrtUniquePtr network, + const BuildOptions& build_options) { + FreshDeviceId(); + ModelToBuildEnv(std::move(network), build_options); + CHECK_NOTNULL(engine_); +} + +bool TRTEngine::ModelToBuildEnv( + TrtUniquePtr network, + const BuildOptions& build) { + CHECK_NOTNULL(builder_); + std::swap(network, network_); + CHECK_NOTNULL(network_); + // ModelToNetwork(network_, logger); + NetworkToEngine(build); + return true; +} + +bool TRTEngine::NetworkToEngine(const BuildOptions& build) { + TrtUniquePtr config{builder_->createBuilderConfig()}; + CHECK_NOTNULL(config); + CHECK(SetupNetworkAndConfig(build, *network_, *config)); + +#if IS_TRT_VERSION_LT(8000) + engine_.reset(builder_->buildEngineWithConfig(*network_, *config)); +#else + serialized_engine_.reset( + builder_->buildSerializedNetwork(*network_, *config)); + CHECK_NOTNULL(serialized_engine_); + + TrtUniquePtr runtime{createInferRuntime(logger_->GetTrtLogger())}; + CHECK_NOTNULL(runtime); + engine_.reset(runtime->deserializeCudaEngine(serialized_engine_->data(), + serialized_engine_->size())); + CHECK_NOTNULL(engine_); +#endif + return true; +} + +bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, + INetworkDefinition& network, + IBuilderConfig& config) { + builder_->setMaxBatchSize(build.max_batch); + // TODO(wilber): handle one engine - multi execution context case. + IOptimizationProfile* profile{nullptr}; + if (!build.shapes.empty()) { + profile = builder_->createOptimizationProfile(); + CHECK_NOTNULL(profile); + } + + // Set formats and data types of inputs + for (int32_t i = 0; i < network.getNbInputs(); ++i) { + auto* input = network.getInput(i); + if (!build.input_formats.empty()) { + input->setType(build.input_formats[i].first); + input->setAllowedFormats(build.input_formats[i].second); + } else { + switch (input->getType()) { + case DataType::kINT32: + case DataType::kBOOL: + case DataType::kHALF: + // Leave these as is. + break; + case DataType::kFLOAT: + case DataType::kINT8: + // User did not specify a floating-point format. Default to kFLOAT. + input->setType(DataType::kFLOAT); + break; + } + input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + + if (profile) { + Dims dims = input->getDimensions(); + // TODO(wilber): shape tensor. + const bool is_dynamic_input = std::any_of( + dims.d, dims.d + dims.nbDims, [](int dim) { return dim == -1; }); + if (is_dynamic_input) { + is_dynamic_shape_ = true; + auto shape = build.shapes.find(input->getName()); + + // If no shape is provided + if (shape == build.shapes.end()) { + // TODO(wilber): add infomation. 
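+          // A possible way to make this failure more informative (illustrative
+          // sketch only; the message text is hypothetical): report which
+          // dynamic input is missing its min/opt/max shapes, e.g.
+          //   LOG(ERROR) << "Dynamic input '" << input->getName()
+          //              << "' has no entry in BuildOptions::shapes; "
+          //                 "min/opt/max shapes are required to build a "
+          //                 "TensorRT optimization profile.";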
+ CHECK(false); + } + LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; + std::vector<int> profile_dims{}; + profile_dims = + shape->second[static_cast<size_t>(OptProfileSelector::kMIN)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kMIN, + VecToDims(profile_dims))); + profile_dims = + shape->second[static_cast<size_t>(OptProfileSelector::kOPT)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kOPT, + VecToDims(profile_dims))); + profile_dims = + shape->second[static_cast<size_t>(OptProfileSelector::kMAX)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kMAX, + VecToDims(profile_dims))); + } + } + } + + if (profile && is_dynamic_shape_) { + CHECK(profile->isValid()); // Required optimization profile is invalid + CHECK_NE(config.addOptimizationProfile(profile), -1); + } + + // Set formats and data types of outputs + for (int32_t i = 0, n = network.getNbOutputs(); i < n; i++) { + auto* output = network.getOutput(i); + if (!build.output_formats.empty()) { + // int outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.output_formats[i].first); + output->setAllowedFormats(build.output_formats[i].second); + } else { + output->setAllowedFormats(1U << static_cast<int>(TensorFormat::kLINEAR)); + } + } + + config.setMaxWorkspaceSize(static_cast<size_t>(build.workspace) << 20); + + if (build.fp16) { + config.setFlag(BuilderFlag::kFP16); + bool support_fp16 = builder_->platformHasFastFp16(); + if (support_fp16) { + LOG(INFO) << "Run INFRT-TRT FP16 mode"; + } else { + LOG(INFO) << "You specified FP16 mode, but the hardware does not support " + "FP16 speed up; using FP32 instead."; + } + } + + if (build.tf32) { + config.setFlag(BuilderFlag::kTF32); + bool support_tf32 = builder_->platformHasTf32(); + if (support_tf32) { + LOG(INFO) << "Run INFRT-TRT TF32 mode"; + } else { + LOG(INFO) << "You specified TF32 mode, but the hardware does not support " + "TF32 speed up; using FP32 instead."; + } + } + + // TODO(wilber): other precision. + + // TODO(wilber): precision config. + switch (build.precision_constraints) { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. + break; + case PrecisionConstraints::kOBEY: + config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); + break; + case PrecisionConstraints::kPREFER: + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + break; + } + + // TODO(TRT): DLA config. + + // TODO(TRT): int8 config. + // TODO(TRT): support int8 + if (build.int8) { + assert(false); + config.setFlag(BuilderFlag::kINT8); + bool support_int8 = builder_->platformHasFastInt8(); + if (support_int8) { + LOG(INFO) << "Run INFRT-TRT INT8 mode"; + } + } + + // TODO(TRT): calib config. + + // TODO(TRT): sparse config.
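+  // For reference, a BuildOptions that exercises the dynamic-shape and FP16
+  // paths above could look like the following sketch (the tensor name "x" and
+  // the shapes are illustrative; test_trt_engine.cc in this patch shows a
+  // working variant):
+  //   BuildOptions options;
+  //   options.max_batch = 4;
+  //   options.workspace = 32;  // in MiB; shifted to bytes above
+  //   options.fp16 = true;
+  //   options.shapes["x"][static_cast<size_t>(OptProfileSelector::kMIN)] = {1, 3, 16, 16};
+  //   options.shapes["x"][static_cast<size_t>(OptProfileSelector::kOPT)] = {2, 3, 28, 28};
+  //   options.shapes["x"][static_cast<size_t>(OptProfileSelector::kMAX)] = {4, 3, 28, 28};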
+ + return true; +} + +bool TRTEngine::SetUpInference( + const InferenceOptions& inference, + const std::unordered_map& inputs, + std::unordered_map* outputs) { + // TODO(wilber): now only create one exec_context + FreshDeviceId(); + CHECK(engine_ != nullptr); + nvinfer1::IExecutionContext* ec = engine_->createExecutionContext(); + CHECK(ec != nullptr); + contexts_.emplace_back(ec); + bindings_.emplace_back(new Bindings()); + + for (const auto& it : inputs) { + const int bind_index = engine_->getBindingIndex(it.first.c_str()); + bindings_.front()->AddBinding( + bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); + } + for (auto& it : *outputs) { + const int bind_index = engine_->getBindingIndex(it.first.c_str()); + bindings_.front()->AddBinding( + bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + } + + return true; +} + +void TRTEngine::Run(const phi::GPUContext& ctx) { + if (is_dynamic_shape_) { + DynamicRun(ctx); + } else { + StaticRun(ctx); + } +} + +void TRTEngine::StaticRun(const phi::GPUContext& ctx) { + const int num_bindings = engine_->getNbBindings(); + std::vector buffers(num_bindings, nullptr); + + int runtime_batch = -1; + auto input_binds = bindings_.front()->GetInputBindings(); + for (auto bind : input_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + buffers[bind_index] = + const_cast(static_cast(bind.buffer->data())); + if (runtime_batch != -1) { + CHECK_EQ(runtime_batch, phi::vectorize(bind.buffer->dims())[0]); + } + runtime_batch = bind.buffer->dims()[0]; + } + + auto output_binds = bindings_.front()->GetOutputBindings(); + for (auto bind : output_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + std::vector ddim; + auto dims = engine_->getBindingDimensions(bind_index); + ddim.push_back(runtime_batch); + for (int i = 0; i < dims.nbDims; ++i) { + ddim.push_back(dims.d[i]); + } + bind.buffer->Resize(phi::make_ddim(ddim)); + ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); + buffers[bind_index] = static_cast(bind.buffer->data()); + } + + contexts_.front()->enqueue( + runtime_batch, buffers.data(), ctx.stream(), nullptr); +} + +void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { + const int num_bindings = engine_->getNbBindings(); + std::vector buffers(num_bindings, nullptr); + + auto input_binds = bindings_.front()->GetInputBindings(); + for (auto bind : input_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + buffers[bind_index] = + const_cast(static_cast(bind.buffer->data())); + nvinfer1::Dims trt_dims; + trt_dims.nbDims = bind.buffer->dims().size(); + + for (int i = 0; i < trt_dims.nbDims; ++i) { + trt_dims.d[i] = bind.buffer->dims()[i]; + } + contexts_.front()->setBindingDimensions(bind_index, trt_dims); + } + + CHECK(contexts_.front()->allInputDimensionsSpecified()); + + auto output_binds = bindings_.front()->GetOutputBindings(); + for (auto bind : output_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + auto dims = contexts_.front()->getBindingDimensions(bind_index); + std::vector ddim(dims.nbDims); + for (int i = 0; i < dims.nbDims; ++i) { + ddim[i] = dims.d[i]; + } + bind.buffer->Resize(phi::make_ddim(ddim)); + ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); + buffers[bind_index] = static_cast(bind.buffer->data()); + } + + contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr); +} + +void TRTEngine::FreshDeviceId() { + int count; + cudaGetDeviceCount(&count); + 
CHECK_LT(device_id_, count); + phi::backends::gpu::SetDeviceId(device_id_); +} + +void TRTEngine::GetEngineInfo() { +#if IS_TRT_VERSION_GE(8200) + LOG(INFO) << "====== engine info ======"; + std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector( + engine_->createEngineInspector()); + infer_inspector->setExecutionContext(contexts_.front().get()); + LOG(INFO) << infer_inspector->getEngineInformation( + nvinfer1::LayerInformationFormat::kONELINE); + LOG(INFO) << "====== engine info end ======"; +#else + LOG(INFO) << "Inspector needs TensorRT version 8.2 or later."; +#endif +} + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..f72bdaf3ac0b463d086e9aeda62823cc725f2db9 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/backends/tensorrt/trt_utils.h" +#include "paddle/phi/backends/dynload/tensorrt.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { +using namespace nvinfer1; // NOLINT + +// The TRT programming model is as follows: +// 1. The build phase: +// IBuilder* builder = createInferBuilder(&logger_); +// 2. Create a network definition: +// INetworkDefinition* network = builder->createNetworkV2(...); +// 3. Build network: +// network->AddLayer(...) +// 4. Configure network: +// IBuilderConfig* config = builder->createBuilderConfig(); +// config->setMaxWorkspaceSize(...) +// 5. Build a serialized plan and deserialize it into a CUDA engine: +// IHostMemory* serialized_model = builder->buildSerializedNetwork(...); +// IRuntime* runtime = createInferRuntime(&logger_); +// ICudaEngine* engine = runtime->deserializeCudaEngine(...); +// 6. Get execution context: +// IExecutionContext* exec_context = engine->createExecutionContext(); +// 7. Set input data: +// int32_t input_index = engine->getBindingIndex("input"); +// int32_t output_index = engine->getBindingIndex("output"); +// void* buffers[2]; +// buffers[input_index] = input_buffer; +// buffers[output_index] = output_buffer; +// 8. Perform inference: +// exec_context->enqueueV2(buffers, stream, nullptr); +// +// We have encapsulated this logic; please use the following programming model. +// +// TRTEngine trt_engine; +// trt_engine.Build(...); +// trt_engine.SetUpInference(...); +// trt_engine.Run(...); +class TRTEngine { + public: + explicit TRTEngine(int device_id); + + nvinfer1::IBuilder* GetTrtBuilder(); + + // TODO(wilber): Modify signature after infrt-trt ready.
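+  // A fuller usage sketch (mirrors test_trt_engine.cc in this patch; the
+  // tensor names and the network-construction helper are illustrative):
+  //   TRTEngine engine(/*device_id=*/0);
+  //   auto network = ConstructMyNetwork(engine.GetTrtBuilder());
+  //   BuildOptions build_options;
+  //   build_options.max_batch = 4;
+  //   engine.Build(std::move(network), build_options);
+  //
+  //   InferenceOptions inference_options;
+  //   inference_options.batch = 2;
+  //   std::unordered_map<std::string, phi::DenseTensor*> inputs, outputs;
+  //   inputs.emplace("model_input", &input_tensor);
+  //   outputs.emplace("model_output", &output_tensor);
+  //   engine.SetUpInference(inference_options, inputs, &outputs);
+  //   engine.GetEngineInfo();
+  //   engine.Run(gpu_context);  // a phi::GPUContext with its allocator set up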
+ void Build(TrtUniquePtr network, + const BuildOptions& build_options); + + // TODO(wilber): Modify signature after infrt-trt ready. + void Run(const phi::GPUContext& ctx); + + // TODO(wilber): How to support multiple execution contexts? + bool SetUpInference( + const InferenceOptions& inference, + const std::unordered_map& inputs, + std::unordered_map* outputs); + + void GetEngineInfo(); + + private: + void FreshDeviceId(); + + bool SetupNetworkAndConfig(const BuildOptions& build, + INetworkDefinition& network, // NOLINT + IBuilderConfig& config); // NOLINT + + bool NetworkToEngine(const BuildOptions& build); + + bool ModelToBuildEnv(TrtUniquePtr network, + const BuildOptions& build); + + void StaticRun(const phi::GPUContext& ctx); + + void DynamicRun(const phi::GPUContext& ctx); + + private: + std::unique_ptr logger_{nullptr}; + TrtUniquePtr builder_{nullptr}; + TrtUniquePtr network_{nullptr}; + std::unique_ptr serialized_engine_{nullptr}; + TrtUniquePtr engine_{nullptr}; + std::vector> contexts_; + std::vector> bindings_; + int device_id_{0}; + bool is_dynamic_shape_{false}; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_options.h b/paddle/infrt/backends/tensorrt/trt_options.h new file mode 100644 index 0000000000000000000000000000000000000000..d5190f5e6220e682c3d3a3ab564e381a3180caff --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_options.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include + +namespace infrt { +namespace backends { +namespace tensorrt { + +// Build default params +constexpr int32_t max_batch_not_provided{0}; +constexpr int32_t default_workspace{16}; +// Inference default params +constexpr int32_t default_batch{1}; +constexpr int32_t batch_not_provided{0}; + +enum class PrecisionConstraints { kNONE, kOBEY, kPREFER }; + +enum class SparsityFlag { kDISABLE, kENABLE, kFORCE }; + +using ShapeRange = + std::array, + nvinfer1::EnumMax()>; + +using IOFormat = std::pair; + +struct BuildOptions { + // Set max batch size. + int32_t max_batch{max_batch_not_provided}; + + // Set workspace size in megabytes (default = 16) + int32_t workspace{default_workspace}; + + // Enable tf32 precision, in addition to fp32 (default = disabled) + bool tf32{false}; + + // Enable fp16 precision, in addition to fp32 (default = disabled) + bool fp16{false}; + + // Enable int8 precision, in addition to fp32 (default = disabled) + bool int8{false}; + + // Control precision constraints. (default = none) + // Precision Constaints: = none, obey, prefer + // none = no constraints + // prefer = meet precision constraints if possible + // obey = meet precision constraints or fail otherwise + PrecisionConstraints precision_constraints{PrecisionConstraints::kNONE}; + + // Save the serialized engine. 
+ bool save{false}; + + // Load a serialized engine. + bool load{false}; + + // Build with dynamic shapes using a profile with the min, max and opt shapes + // provided + std::unordered_map shapes; + + // Type and format of each of the input tensors (default = all inputs in + // fp32:chw) + std::vector input_formats; + + // Type and format of each of the output tensors (default = all outputs in + // fp32:chw) + std::vector output_formats; +}; + +struct InferenceOptions { + int32_t batch{batch_not_provided}; + std::unordered_map> shapes; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..4b129af1d53810c6d37d23270c1118023ae7b3f6 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -0,0 +1,147 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "glog/logging.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +#define IS_TRT_VERSION_GE(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) + +#define IS_TRT_VERSION_LT(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) < version) + +#define TRT_VERSION \ + NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD + +inline nvinfer1::Dims VecToDims(const std::vector& vec) { + int limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) { + assert(false); + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +template +struct TrtDestroyer { + void operator()(T* t) { t->destroy(); } +}; + +template +using TrtUniquePtr = std::unique_ptr>; + +class TrtLogger : public nvinfer1::ILogger { + public: + void log(nvinfer1::ILogger::Severity severity, + const char* msg) noexcept override { + switch (severity) { + case Severity::kVERBOSE: + VLOG(3) << msg; + break; + case Severity::kINFO: + VLOG(2) << msg; + break; + case Severity::kWARNING: + LOG(WARNING) << msg; + break; + case Severity::kINTERNAL_ERROR: + case Severity::kERROR: + LOG(ERROR) << msg; + break; + default: + break; + } + } + nvinfer1::ILogger& GetTrtLogger() noexcept { return *this; } + ~TrtLogger() override = default; +}; + +struct Binding { + bool is_input{false}; + nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; + phi::DenseTensor* buffer{nullptr}; + std::string name; +}; + +class Bindings { + public: + Bindings() = 
default; + + void AddBinding(int32_t b, + const std::string& name, + bool is_input, + phi::DenseTensor* buffer, + nvinfer1::DataType data_type) { + while (bindings_.size() <= static_cast(b)) { + bindings_.emplace_back(); + } + names_[name] = b; + bindings_[b].buffer = buffer; + bindings_[b].is_input = is_input; + bindings_[b].data_type = data_type; + bindings_[b].name = name; + } + + std::vector GetInputBindings() { + return GetBindings([](const Binding& b) -> bool { return b.is_input; }); + } + + std::vector GetOutputBindings() { + return GetBindings([](const Binding& b) -> bool { return !b.is_input; }); + } + + std::vector GetBindings() { + return GetBindings([](const Binding& b) -> bool { return true; }); + } + + std::vector GetBindings( + std::function predicate) { + std::vector bindings; + for (const auto& b : bindings_) { + if (predicate(b)) { + bindings.push_back(b); + } + } + return bindings; + } + + private: + std::unordered_map names_; + std::vector bindings_; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index 757d47a8de43e2a394ad5296e617ed6ed94078f3..e35989da2085b21f4dbfaadea05793fc9dcb8753 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -16,7 +16,7 @@ gather_srcs(infrt_src SRCS mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) -mlir_tablegen_on(infrt_base DIALECT infrt) +mlir_tablegen_on(infrt_base DIALECT Infrt) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) mlir_tablegen_on(pd_op_base DIALECT pd) @@ -36,6 +36,6 @@ cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_I add_subdirectory(infrt) add_subdirectory(tensorrt) -if (INFRT_WITH_PTEN) - add_subdirectory(pten) +if (INFRT_WITH_PHI) + add_subdirectory(phi) endif() diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index bad7e73ec5ae5c3216a912729637664bba17d3b0..c1aa75fb24650b99ea8371c0ecbe7e572df2f0ce 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -90,7 +90,7 @@ static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op->getAttr("callee") << "("; + p << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; p.printOptionalAttrDict(op->getAttrs(), {"callee"}); @@ -98,7 +98,7 @@ static void print(OpAsmPrinter &p, CallOp op) { // NOLINT } static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT - p << op->getName() << " "; + p << " "; p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); if (op->getAttrs().size() > 1) p << ' '; @@ -128,7 +128,6 @@ static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT } static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT - p << "infrt.return"; if (op.getNumOperands() > 0) { p << ' '; p.printOperands(op.getOperands()); diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index 32845a09351f70fe1acd7659b8c5e3a579ff83e0..aadc146e36280f79902f3b9ed90f3203fb9e5384 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -48,10 +48,10 @@ def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; def ReturnOp : INFRT_Op<"return", [Terminator]> { let summary = "host executor return operation"; let description = [{ - The "infrt.return" operation 
represents a return operation within a function. + The "Infrt.return" operation represents a return operation within a function. func @foo() : (i32, f8) { - infrt.return %0, %1 : i32, f8 + Infrt.return %0, %1 : i32, f8 } }]; @@ -112,7 +112,7 @@ def PrintF32Op : PrintOp<"f32", F32>; def PrintF64Op : PrintOp<"f64", F64>; def GetStringOp : INFRT_Op<"get_string"> { - let summary = "infrt.get_string"; + let summary = "Infrt.get_string"; let description = [{ Get a !infrt.string value from the given string attribute. }]; @@ -124,7 +124,7 @@ def GetStringOp : INFRT_Op<"get_string"> { } def PrintStringOp : INFRT_Op<"print_string"> { - let summary = "infrt.print_string"; + let summary = "Infrt.print_string"; let description = [{ An operation that prints a string. }]; diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index fde265765c6d2251019403a1a7bc861206d3fe0c..49d6887ada0322065946f95c9e39d932f268375e 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -39,52 +39,6 @@ void DTDialect::initialize() { >(); } -llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_insensitive("x86")) - return TargetType::X86; - else if (key.equals_insensitive("cuda")) - return TargetType::CUDA; - else - return llvm::None; -} - -llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_insensitive("nchw")) - return LayoutType::NCHW; - else if (key.equals_insensitive("nhwc")) - return LayoutType::NHWC; - else - return llvm::None; -} - -llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_insensitive("i32")) - return PrecisionType::I32; - else if (key.equals_insensitive("f32")) - return PrecisionType::F32; - else - return llvm::None; -} - -TensorType TensorType::get(mlir::MLIRContext *ctx, - TargetType target, - LayoutType layout, - PrecisionType precision) { - return Base::get(ctx, target, layout, precision); -} - -TargetType TensorType::target() { return getImpl()->target_; } - -LayoutType TensorType::layout() { return getImpl()->layout_; } - -PrecisionType TensorType::precision() { return getImpl()->precision_; } - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { - os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() - << ", " << tensorType.precision() << ">"; - return os; -} - TensorMapType TensorMapType::get() { return Base::get(::infrt::Global::getMLIRContext()); } @@ -101,48 +55,6 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { - switch (type) { - case (TargetType::X86): - os << "X86"; - break; - case (TargetType::CUDA): - os << "CUDA"; - break; - default: - os << "Unsupported"; - } - return os; -} - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { - switch (type) { - case (LayoutType::NCHW): - os << "NCHW"; - break; - case (LayoutType::NHWC): - os << "NHWC"; - break; - default: - os << "Unsupported"; - } - return os; -} - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { - switch (type) { - case (PrecisionType::I32): - os << "I32"; - break; - case (PrecisionType::F32): - os << "F32"; - break; - default: - os << "Unsupported"; - } - return os; -} - static mlir::Type getTensorType(mlir::MLIRContext *context) { auto t_dialect = mlir::Identifier::get("t", context); return mlir::OpaqueType::get(t_dialect, "tensor"); @@ -165,7 +77,7 @@ static mlir::ParseResult 
parseCreateUninitTensorOp( if (parser.parseArrow()) return mlir::failure(); if (parser.parseType(outputRawTypes[0])) return mlir::failure(); - if (!outputRawTypes[0].isa()) + if (!outputRawTypes[0].isa()) return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); return mlir::success(); diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 08ba8d720662b8c7ac4f224d8fe6366d4acc7d3e..b0a1ea412c53eb677fed1a1b76e704f3f3da11e5 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,68 +19,10 @@ #include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" + namespace infrt { namespace dt { -enum class TargetType : uint8_t { X86, CUDA }; -enum class LayoutType : uint8_t { NCHW, NHWC }; -enum class PrecisionType : uint8_t { I32, F32 }; - -llvm::Optional GetTargetType(mlir::StringRef key); -llvm::Optional GetLayoutType(mlir::StringRef key); -llvm::Optional GetPrecisionType(mlir::StringRef key); - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); - -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - -class TensorType : public mlir::Type::TypeBase { - public: - using Base::Base; - - static TensorType get(mlir::MLIRContext *ctx, - TargetType target, - LayoutType layout, - PrecisionType precision); - - TargetType target(); - LayoutType layout(); - PrecisionType precision(); -}; - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType); - class TensorMapType : public mlir::Type::TypeBase { diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 75c8a0d88e4c11f5e27d7b6d38062e118475274b..7e6e838a72372d2f850d4fb37f6b2218577ba0ed 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -28,7 +28,7 @@ class CreateUninitTensorOp }]; let arguments = (ins I64ArrayAttr:$shape); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); let parser = [{ return infrt::dt::parseCreateUninitTensorOp(parser, result); }]; let printer = [{ return infrt::dt::printCreateUninitTensorOp(p, *this); }]; @@ -43,8 +43,8 @@ def ShallowCopyTensorOp An operation that copy a tensor shallowly. 
}]; - let arguments = (ins TensorType:$input); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$input); + let results = (outs DenseTensor:$output); let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; } @@ -59,7 +59,7 @@ class FillTensorWithConstantOp : }]; let arguments = (ins - TensorType:$input, + DenseTensor:$input, AnyAttr:$value ); let results = (outs); @@ -77,7 +77,7 @@ def PrintTensorOp : DT_Op<"print_tensor"> { An operation that prints a tensor. }]; - let arguments = (ins TensorType:$input); + let arguments = (ins DenseTensor:$input); let results = (outs); let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } @@ -90,7 +90,7 @@ class SetTensorOp : An operation that sets an input tensor with given values. }]; - let arguments = (ins TensorType); + let arguments = (ins DenseTensor); let results = (outs); let parser = [{ return infrt::dt::parseSetTensorOp(parser, result); }]; @@ -125,7 +125,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { TensorMapType:$map, StrAttr:$name ); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)"; let verifier = ?; } @@ -149,7 +149,7 @@ def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { An operation that returns the shape of the input tensor. }]; - let arguments = (ins TensorType:$input); + let arguments = (ins DenseTensor:$input); let results = (outs TS_Shape:$output); let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; } @@ -162,8 +162,8 @@ class NaiveElementwiseAddOp : Naive elementwise_add operation. Just for testing. }]; - let arguments = (ins TensorType:$a, TensorType:$b); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$a, DenseTensor:$b); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` $a `,` $b `)` attr-dict `:` `(` type($a) `,` type($b) `)` `->` type($output)"; } @@ -175,8 +175,8 @@ class NaiveMatmulOp : Naive matmul operation. Just for testing. }]; - let arguments = (ins TensorType:$x, TensorType:$w); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$x, DenseTensor:$w); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` $x `,` $w `)` attr-dict `:` `(` type($x) `,` type($w) `)` `->` type($output)"; } diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index 98910d8d0ecf0b99bd1eb8b860ed573ae88ef203..daf710e0baf54549a2cc3e7a6e87c7b76a169f29 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -1,7 +1,15 @@ core_gather_headers() gather_srcs(infrt_src SRCS + common_type.cc infrt_dialect.cc ) -add_mlir_dialect(infrt_ops Infrt) + +add_mlir_dialect(infrt_ops infrt) + +set(LLVM_TARGET_DEFINITIONS infrt_ops.td) +mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) +mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) +add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) +add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common_type.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cbd7b2cd6153f3724bc357811bdb0894eeb64ba --- /dev/null +++ b/paddle/infrt/dialect/infrt/common_type.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { + +llvm::Optional GetTargetType(llvm::StringRef key) { + if (key.equals_insensitive("CPU")) + return TargetType::CPU; + else if (key.equals_insensitive("GPU")) + return TargetType::GPU; + else + return llvm::None; +} + +llvm::Optional GetLayoutType(llvm::StringRef key) { + if (key.equals_insensitive("NCHW")) + return LayoutType::NCHW; + else if (key.equals_insensitive("NHWC")) + return LayoutType::NHWC; + else + return llvm::None; +} + +llvm::Optional GetPrecisionType(llvm::StringRef key) { + if (key.equals_insensitive("FP32")) + return PrecisionType::FLOAT32; + else if (key.equals_insensitive("FP16")) + return PrecisionType::FLOAT16; + else + return llvm::None; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type) { + switch (type) { + case (TargetType::CPU): + os << "CPU"; + break; + case (TargetType::GPU): + os << "GPU"; + break; + default: + os << "Unsupported"; + } + return os; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type) { + switch (type) { + case (LayoutType::NCHW): + os << "NCHW"; + break; + case (LayoutType::NHWC): + os << "NHWC"; + break; + default: + os << "Unsupported"; + } + return os; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type) { + switch (type) { + case (PrecisionType::FLOAT32): + os << "FP32"; + break; + case (PrecisionType::FLOAT16): + os << "FP16"; + break; + default: + os << "Unsupported"; + } + return os; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h new file mode 100644 index 0000000000000000000000000000000000000000..d6d6503c03be5722cf398c8abac4485aae5d9a8c --- /dev/null +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
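+//
+// This header declares the target/precision/layout enums and the Place struct
+// shared by the infrt dialects. The Get*Type helpers map the keywords used in
+// the MLIR textual syntax onto these enums, for example (sketch of intended
+// use):
+//   GetTargetType("GPU")     -> TargetType::GPU
+//   GetPrecisionType("FP32") -> PrecisionType::FLOAT32
+//   GetLayoutType("NCHW")    -> LayoutType::NCHW
+// Unknown keywords yield llvm::None; operator<< prints the same keywords back
+// and falls back to "Unsupported" for unhandled values.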
+ +#pragma once + +#include +#include +#include + +namespace infrt { + +enum class TargetType : uint8_t { CPU, GPU, UNK }; +enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; + +struct Place { + TargetType target; + PrecisionType precision; + LayoutType layout; + Place(TargetType tar, PrecisionType pre, LayoutType lay) + : target(tar), precision(pre), layout(lay) {} + Place() + : target(TargetType::UNK), + precision(PrecisionType::UNK), + layout(LayoutType::UNK) {} +}; + +llvm::Optional GetTargetType(llvm::StringRef key); +llvm::Optional GetLayoutType(llvm::StringRef key); +llvm::Optional GetPrecisionType(llvm::StringRef key); + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type); +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type); +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type); + +} // end namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/infrt_dialect.cc index 388de858b6572ea5900851b170d09589387c0b05..abb60016f90233cae68dc99e95885042517e9212 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/infrt_dialect.cc @@ -23,6 +23,9 @@ #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" +#define GET_ATTRDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" + #define GET_OP_CLASSES #include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" @@ -33,6 +36,12 @@ void InfrtDialect::initialize() { #define GET_TYPEDEF_LIST #include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT >(); + + addAttributes< +#define GET_ATTRDEF_LIST +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" // NOLINT + >(); + addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT @@ -57,36 +66,104 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { // Parse the element type. 
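+  // (For reference, this branch parses forms like lod_tensor<3x64x3x3xf32, 5>,
+  // and the dense_tensor branch below parses forms like
+  // dense_tensor<GPU, FP32, NCHW>, assuming the target/precision/layout
+  // keywords defined in common_type.cc.)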
if (parser.parseType(elementType)) return nullptr; - // parse "," - if (parser.parseComma()) return nullptr; - - // llvm::APInt lod_level; - if (parser.parseInteger(lod_level)) return nullptr; - + // parse optional lod_level + if (parser.parseOptionalComma().succeeded()) { + // llvm::APInt lod_level; + if (parser.parseInteger(lod_level)) return nullptr; + } // parse ">" if (parser.parseGreater()) return nullptr; return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor") { + // parse DenseTensor, for example: !i=Infrt.tensor + llvm::StringRef target; + llvm::StringRef layout; + llvm::StringRef precision; + + // parse "<" + if (parser.parseLess()) return mlir::Type(); + // parse target + if (parser.parseKeyword(&target)) return mlir::Type(); + auto targetType = GetTargetType(target); + if (!targetType) { + parser.emitError(parser.getCurrentLocation(), "unknown target type: ") + << target; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse precision + if (parser.parseKeyword(&precision)) return mlir::Type(); + auto precisionType = GetPrecisionType(precision); + if (!precisionType) { + parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") + << precision; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + + // parse layout + if (parser.parseKeyword(&layout)) return mlir::Type(); + auto layoutType = GetLayoutType(layout); + if (!layoutType) { + parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") + << layout; + return mlir::Type(); + } + // parse ">" + if (parser.parseGreater()) return mlir::Type(); + return DenseTensorType::get( + parser.getContext(), *targetType, *precisionType, *layoutType); + } // Todo: parse other type return mlir::Type(); } void InfrtDialect::printType(::mlir::Type type, ::mlir::DialectAsmPrinter &os) const { - // print TensorType, for example: !infrt.tensor + // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5> if (type.isa()) { - auto lodTensorType = type.cast(); + auto lod_tensor_type = type.cast(); os << "lod_tensor<"; - auto shape = lodTensorType.getShape(); - for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) - os << *dim << 'x'; - os << shape.back() << 'x' << lodTensorType.getElementType() << ", " - << lodTensorType.getLod_level() << ">"; + auto shape = lod_tensor_type.getShape(); + for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) { + *dim < 0 ? os << '?' : os << *dim; + os << 'x'; + } + shape.back() < 0 ? os << '?' : os << shape.back(); + os << 'x' << lod_tensor_type.getElementType() << ", " + << lod_tensor_type.getLod_level() << ">"; return; } + + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + auto dense_tensor_type = type.cast(); + os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " + << dense_tensor_type.getPrecision() << ", " + << dense_tensor_type.getLayout() << ">"; + return; + } + llvm_unreachable("unknown infrt type."); } +// /// Parse an attribute registered to this dialect. +// ::mlir::Attribute InfrtDialect::parseAttribute(::mlir::DialectAsmParser +// &parser, +// ::mlir::Type type) const { +// return mlir::Attribute(); +// } +// /// Print an attribute registered to this dialect. 
+// void InfrtDialect::printAttribute(::mlir::Attribute attr, +// ::mlir::DialectAsmPrinter &os) const { + +// } + } // namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/infrt_dialect.h index 21a1f6b34f6a5f33bd82c4e78669ee24221a08f1..ed5b36e556149dbc3026e732cf953c5562841921 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/infrt_dialect.h @@ -17,13 +17,19 @@ //===----------------------------------------------------------------------===// // Dialect //===----------------------------------------------------------------------===// +#include #include #include #include #include +#include "paddle/infrt/dialect/infrt/common_type.h" #include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" + +#define GET_ATTRDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc" + #define GET_OP_CLASSES #include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td index 319760973cd90c667793e29761c030141990c242..00f94805c7db22e170c7395598bfe647174339c1 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -1,34 +1,4 @@ -#ifndef Infrt_OpS -#define Infrt_OpS - -include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffectInterfaces.td" - -def Infrt_Dialect : Dialect { - let summary = - "A dialect containing the Infrt Attributes, Operations, and Types"; - - let name = "Infrt"; - let cppNamespace = "::infrt"; -} - -// Type definitions - -// Base class for Infrt dialect types. -class Infrt_Type traits = [], - string baseCppClass = "::mlir::Type"> - : TypeDef { -} - -def LoDTensor : Infrt_Type<"LoDTensor"> { - let summary = "infrt lod tensor"; - let description = [{lod_tensor<3x64x3x3xf32, 3>}]; - let parameters = (ins - ArrayRefParameter<"int64_t">:$shape, - "mlir::Type":$elementType, - "int32_t":$lod_level - ); -} +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" // Op definition class Infrt_Op traits = []> : Op { @@ -39,14 +9,11 @@ class Infrt_Op traits = []> : Op { -// let summary = "kernel op"; -// let description = [{ -// kernel op! -// }]; -// let arguments = (ins StrAttr:$name, PD_Tensor:$X, PD_Tensor:$Y, DefaultValuedAttr:$Alpha, DefaultValuedAttr:$Beta); -// -// let results = (outs PD_Tensor:$Out); -// } - -#endif // Infrt_OpS +def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { + let summary = "kernel op"; + let description = [{kernel op!}]; + let arguments = (ins Variadic:$operands, + StrAttr:$name, + OptionalAttr:$attrs); + let results = (outs Variadic); +} diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td new file mode 100644 index 0000000000000000000000000000000000000000..81d3d028a66bea29dd9a373e1905ac02468251fd --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -0,0 +1,49 @@ +#ifndef INFRT_OPS_BASE +#define INFRT_OPS_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def Infrt_Dialect : Dialect { + let summary = + "A dialect containing the Infrt Attributes, Operations, and Types"; + + let name = "infrt"; + let cppNamespace = "::infrt"; +} + +// Type definitions + +// Base class for Infrt dialect types. 
+class Infrt_Type traits = [], + string baseCppClass = "::mlir::Type"> + : TypeDef { +} + +def LoDTensor : Infrt_Type<"LoDTensor"> { + let summary = "infrt lod tensor"; + let description = [{lod_tensor<3x64x3x3xf32, 3>}]; + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "mlir::Type":$elementType, + "int32_t":$lod_level + ); +} + +def DenseTensor : Infrt_Type<"DenseTensor"> { + let summary = "infrt dense tensor"; + let description = [{dense_tensor<, 3>}]; + let parameters = (ins + "TargetType":$target, + "PrecisionType":$precision, + "LayoutType":$layout + ); +} + +// Base class for infrt dialect attributes. +class Infrt_Attr traits = [], + string baseCppClass = "::mlir::Attribute"> + : AttrDef { + let mnemonic = ?; +} +#endif // INFRT_OPS_BASE diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index c0101a8c16608bc732f7b786c62ed4ea90ab2628..8c595c06745f1be8453c4d1f08ba00f4d9ceaf90 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -27,7 +27,6 @@ void INFRTDialect::initialize() { allowUnknownOperations(); addTypes(); - addTypes(); addTypes(); addOperations< @@ -43,51 +42,6 @@ void INFRTDialect::initialize() { mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); - // parse TensorType, for example: !infrt.tensor - if (keyword == "tensor") { - llvm::StringRef target; - llvm::StringRef layout; - llvm::StringRef precision; - - // parse "<" - if (parser.parseLess()) return mlir::Type(); - // parse target - if (parser.parseKeyword(&target)) return mlir::Type(); - auto targetType = infrt::dt::GetTargetType(target); - if (!targetType) { - parser.emitError(parser.getCurrentLocation(), "unknown target type: ") - << target; - return mlir::Type(); - } - - // parse "," - if (parser.parseComma()) return mlir::Type(); - // parse layout - if (parser.parseKeyword(&layout)) return mlir::Type(); - auto layoutType = infrt::dt::GetLayoutType(layout); - if (!layoutType) { - parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") - << layout; - return mlir::Type(); - } - - // parse "," - if (parser.parseComma()) return mlir::Type(); - // parse precision - if (parser.parseKeyword(&precision)) return mlir::Type(); - auto precisionType = infrt::dt::GetPrecisionType(precision); - if (!precisionType) { - parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") - << precision; - return mlir::Type(); - } - - // parse ">" - if (parser.parseGreater()) return mlir::Type(); - - return infrt::dt::TensorType::get( - parser.getContext(), *targetType, *layoutType, *precisionType); - } // parse TensorMapType, for example: !infrt.tensor_map if (keyword == "tensor_map") { return infrt::dt::TensorMapType::get(); @@ -104,13 +58,6 @@ mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { void INFRTDialect::printType(mlir::Type type, mlir::DialectAsmPrinter &printer) const { - // print TensorType, for example: !infrt.tensor - if (type.isa()) { - auto tensorType = type.cast(); - printer << "tensor<" << tensorType.target() << ", " << tensorType.layout() - << ", " << tensorType.precision() << ">"; - return; - } // print TensorMapType, for example: !infrt.tensor_map if (type.isa()) { printer << "tensor_map"; diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 4021a5a6d3cd2b6d7ca272b69c6cc477ea25cad2..3ef73171dcdea4e0367837f4b3893405c29a1580 100644 --- 
a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -43,7 +43,7 @@ class INFRTDialect : public mlir::Dialect { friend class mlir::MLIRContext; public: - static ::llvm::StringRef getDialectNamespace() { return "infrt"; } + static ::llvm::StringRef getDialectNamespace() { return "Infrt"; } }; } // namespace dialect @@ -54,6 +54,20 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } +template +static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getSI32IntegerAttr(constant); +} + +template +static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getF32FloatAttr(constant); +} + static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { return mlir::SmallVector(1, operand); diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 1abd294236d93cfb0aa7ce70db25f2acfb57a06a..0f50eb2d8fb4ac83578f13888d05188a9143382f 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -2,9 +2,10 @@ #define INFRT_BASE include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" def INFRT_Dialect : Dialect { - let name = "infrt"; + let name = "Infrt"; let description = [{ The INFRT host dialect. @@ -18,9 +19,6 @@ def StringType : Type()">, "!infrt.string type">, BuildableType<"$_builder.getType<::infrt::dt::StringType>()">; -def TensorType : - Type()">, "!infrt.tensor type">; - def TensorMapType : Type()">, "!infrt.tensor_map type">, BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">; @@ -30,6 +28,12 @@ def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< "infrt::createI32Attr($_builder, $_loc, " # value # ")">; +class INFRT_createSI32Attr : NativeCodeCall< + "infrt::createSI32Attr($_builder, $_loc, " # value # ")">; + +class INFRT_createF32Attr : NativeCodeCall< + "infrt::createF32Attr($_builder, $_loc, " # value # ")">; + def INFRT_cvtValueToValueRange : NativeCodeCall< "infrt::cvtValueToValueRange($0)">; diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 090f1aea289109feda54b12131daf2993ea4e5e0..b5b8de7a20d0866802b8ce72e12dd7ed35dccbd1 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -32,9 +32,9 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT infrt::InfrtDialect, dt::DTDialect, mlir::pd::PaddleDialect, -#ifdef INFRT_WITH_PTEN - pten::PTENDenseTensorDialect, - pten::PTENDialect +#ifdef INFRT_WITH_PHI + phi::PHIDenseTensorDialect, + phi::PHIDialect #endif >(); } diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 11150530730444ed74f547b9bb8abef5473c61b0..2f721e49a63096d1c3168805d373cbc8809542da 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -32,13 +32,13 
@@ TEST(MlirLoader, basic) { auto source = R"ROC( func @main() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v0) : (f32) -> () + "Infrt.print.f32"(%v0) : (f32) -> () - infrt.return %value : f32 + Infrt.return %value : f32 } )ROC"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index a61a4645eff76cc1fdcbf5176bf4d3e9a606f89e..266bdf60de788df0507a5bf0ef679945cb7c2abc 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -6,7 +6,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" def PD_Dialect : Dialect { let name = "pd"; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index 7cf5b2fb20f527eefe31f817c7fe85c7864c8669..338b04e001320289b71f6127318e7a073cefcacf 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -24,11 +24,11 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - namespace mlir { namespace pd { +#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT + PaddleDialect::PaddleDialect(MLIRContext *context) : Dialect("pd", context, TypeID::get()) { addOperations< diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..626b02c1f790d0a7f38887be33dace1c773a2cb1 --- /dev/null +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -0,0 +1,18 @@ +if (NOT INFRT_WITH_PHI) + return() +endif() + +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) +add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc infrt_phi_tensor.cc + infrt_phi_tensor.cc) + +add_subdirectory(pass) + +add_executable(phi-exec phi_exec.cc) +target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/pten/infrt_pten_base.td b/paddle/infrt/dialect/phi/infrt_phi_base.td similarity index 54% rename from paddle/infrt/dialect/pten/infrt_pten_base.td rename to paddle/infrt/dialect/phi/infrt_phi_base.td index 20a43f9a92620debd4cf382222de5f9dfe93b9a2..907f912d9e638ba76e5010d5442381d1aa053bc2 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_base.td +++ b/paddle/infrt/dialect/phi/infrt_phi_base.td @@ -1,28 +1,30 @@ -#ifndef PTEN_BASE -#define PTEN_BASE +#ifndef PHI_BASE +#define PHI_BASE include "mlir/IR/OpBase.td" -def PTEN_Dialect : Dialect { - let name = "pten"; +def PHI_Dialect : Dialect { + let name = "phi"; let description = [{ - The PTEN host dialect. + The PHI host dialect. 
}]; - let cppNamespace = "::infrt::pten"; + let cppNamespace = "::infrt::phi"; } class AllocatorTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!pten.allocator_", place, " type"); + TypeDef { + let summary = !strconcat("!phi.allocator_", place, " type"); } class ContextTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!pten.context_", place, " type"); + TypeDef { + let summary = !strconcat("!phi.context_", place, " type"); } +def PhiOpTrait : NativeOpTrait<"PhiOpTrait">; + def CPU_Allocator : AllocatorTypeOf<"CPU">; def GPU_Allocator : AllocatorTypeOf<"GPU">; diff --git a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/infrt_phi_kernel.td new file mode 100644 index 0000000000000000000000000000000000000000..879994907cc0d951bde838b23fd129e865a360f2 --- /dev/null +++ b/paddle/infrt/dialect/phi/infrt_phi_kernel.td @@ -0,0 +1,29 @@ +#ifndef PHI_KERNEL +#define PHI_KERNEL + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/phi/infrt_phi_base.td" + +def PHI_KernelDialect : Dialect { + let name = "phi_kernel"; + + let description = [{ + The PHI Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +// PHI Kernel related ops. +class PDT_Kernel traits = []> : Op { +} + +def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); + let results = (outs DenseTensor:$output); +} + +#endif + diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.cc b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc similarity index 65% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.cc rename to paddle/infrt/dialect/phi/infrt_phi_tensor.cc index b3e99da8750fb9691833256b2d7d1f09aae2e27c..9df1a47031b1f726578291f628cda7d12900bcb7 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.cc +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc @@ -12,25 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/pten/infrt_pten_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" namespace infrt { -namespace pten { +namespace phi { -void PTENDenseTensorDialect::initialize() { +void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" >(); } -} // namespace pten +} // namespace phi } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.h b/paddle/infrt/dialect/phi/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.h rename to paddle/infrt/dialect/phi/infrt_phi_tensor.h index 5fe259300d2aec32fade1141de2dbf8cef314687..2780f9759185ef45bc19f43fc621f46eabbe7a66 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.h +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/pten/infrt_pten_tensorDialect.h.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.td b/paddle/infrt/dialect/phi/infrt_phi_tensor.td similarity index 63% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.td rename to paddle/infrt/dialect/phi/infrt_phi_tensor.td index 528f0f919680d65dd9636b96686838b427459eff..b7b3b061fdbe42909ac503d9d387cb8aed6bdc1a 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.td +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.td @@ -1,36 +1,36 @@ -#ifdef PTEN_TENSOR +#ifdef PHI_TENSOR #else -#define PTEN_TENSOR +#define PHI_TENSOR -include "paddle/infrt/dialect/pten/infrt_pten_base.td" +include "paddle/infrt/dialect/phi/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -def PTEN_DenseTensorDialect : Dialect { - let name = "pten_dt"; +def PHI_DenseTensorDialect : Dialect { + let name = "phi_dt"; let description = [{ - The PTEN DenseTensor dialect. + The PHI DenseTensor dialect. }]; - let cppNamespace = "::infrt::pten"; + let cppNamespace = "::infrt::phi"; } -// PTEN DenseTensor related Op. -class PDT_Op traits = []> : Op { +// PHI DenseTensor related Op. +class PDT_Op traits = []> : Op { } class CreateDenseTensorOp : PDT_Op<"create_dense_tensor." # place # "." # dtype # "." # layout, [NoSideEffect]> { let arguments = (ins CPU_Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); } class FillDenseTensorOp : PDT_Op<"fill_dense_tensor." 
# dtype> { let arguments = (ins - TensorType:$input, + DenseTensor:$input, attr_type:$value ); let results = (outs); @@ -53,4 +53,9 @@ def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); + let results = (outs DenseTensor:$output); +} + #endif diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c55a6b0acaed7be9ee86b4662d895d08ca05bdc --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + proto_arg_map_context.cc + phi_op_cvt_pass.cc + kernel_op_desc.cc + ) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..63869b7d7b9ea4fd7841dfe352a3b79e9cd18725 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/kernel_registry.h" +namespace infrt { + +phi::Backend cvtTarget2Phi(TargetType target) { + switch (target) { + case TargetType::CPU: + return phi::Backend::CPU; + case TargetType::GPU: + return phi::Backend::GPU; + default: + return phi::Backend::UNDEFINED; + } +} + +TargetType cvtTargetFromPhi(phi::Backend backend) { + switch (backend) { + case phi::Backend::CPU: + return TargetType::CPU; + case phi::Backend::GPU: + return TargetType::GPU; + default: + return TargetType::UNK; + } +} + +phi::DataType cvtPrecision2Phi(PrecisionType precision) { + switch (precision) { + case PrecisionType::FLOAT32: + return phi::DataType::FLOAT32; + break; + case PrecisionType::FLOAT16: + return phi::DataType::FLOAT16; + default: + return phi::DataType::UNDEFINED; + } +} + +PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { + switch (datatype) { + case phi::DataType::FLOAT32: + return PrecisionType::FLOAT32; + case phi::DataType::FLOAT16: + return PrecisionType::FLOAT16; + default: + return PrecisionType::UNK; + } +} + +phi::DataLayout cvtLayout2Phi(LayoutType layout) { + switch (layout) { + case LayoutType::NCHW: + return phi::DataLayout::NCHW; + case LayoutType::NHWC: + return phi::DataLayout::NHWC; + default: + return phi::DataLayout::UNDEFINED; + } +} + +LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { + switch (layout) { + case phi::DataLayout::NCHW: + return LayoutType::NCHW; + case phi::DataLayout::NHWC: + return LayoutType::NHWC; + default: + return LayoutType::UNK; + } +} + +phi::KernelKey cvtPlace2Phi(const Place& place) { + return phi::KernelKey(cvtTarget2Phi(place.target), + cvtLayout2Phi(place.layout), + cvtPrecision2Phi(place.precision)); +} + +Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { + return Place(cvtTargetFromPhi(tensor_arg.backend), + cvtPrecisionFromPhi(tensor_arg.dtype), + cvtLayoutFromPhi(tensor_arg.layout)); +} + +std::vector getCandidateKernels( + std::string name, const std::vector& valid_palces) { + std::vector candidate_kernels; + PhiKernelDesc phi_kernel_desc; + phi::KernelKeyMap kernel_key_map = + phi::KernelFactory::Instance().SelectKernelMap(name); + for (const Place& place : valid_palces) { + phi::KernelKey kernel_key = cvtPlace2Phi(place); + if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { + kernel_key = phi::KernelKey(kernel_key.backend(), + phi::DataLayout::ALL_LAYOUT, + kernel_key.dtype()); + if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; + } + phi_kernel_desc.kernelType = place; + phi_kernel_desc.inputsType.clear(); + phi_kernel_desc.outputsType.clear(); + phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); + const paddle::SmallVector& input_arg = + args_def.input_defs(); + const paddle::SmallVector& output_arg = + args_def.output_defs(); + for (auto tensor_arg : input_arg) { + phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + } + for (auto tensor_arg : output_arg) { + phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + } + candidate_kernels.emplace_back(phi_kernel_desc); + } + return candidate_kernels; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/pten/pten_base.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h similarity index 65% rename from paddle/infrt/dialect/pten/pten_base.h rename to paddle/infrt/dialect/phi/pass/kernel_op_desc.h index 
c3be6ef4e8bf407ad31ed6318fa249b8e3e55ca5..b74107f674e51f6ca09c864d197d9334a08666ac 100644 --- a/paddle/infrt/dialect/pten/pten_base.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -13,18 +13,20 @@ // limitations under the License. #pragma once -#include -#include -#include #include +#include +#include "paddle/infrt/dialect/infrt/common_type.h" -#include "paddle/infrt/dialect/pten/infrt_pten_base.h.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_baseDialect.h.inc" +namespace infrt { -#define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.h.inc" +struct PhiKernelDesc { + std::vector inputsType; // kernel input place + std::vector outputsType; // kernel output place + Place kernelType; // kernel place +}; + +std::vector getCandidateKernels( + std::string name, const std::vector& valid_palces); -namespace infrt { -namespace pten {} // namespace pten } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..df3472aa01dfb8bfa0e7f6122410c1b4788cd359 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/ops/compat/signatures.h" +namespace infrt { +// Implementation of the phiOpCvtPass. +void phiOpCvtPass::runOnFunction() { + convertStage(); + diapatchStage(); +} +void phiOpCvtPass::convertStage() { + mlir::Block &body = getFunction().front(); + std::vector worklist; + for (auto &op : body.without_terminator()) { + worklist.push_back(&op); + } + mlir::OpBuilder builder(&body, body.begin()); + while (!worklist.empty()) { + auto *op = worklist.back(); + worklist.pop_back(); + if (op == nullptr) continue; + + std::string op_name = op->getName().getIdentifier().str(); + + // only convert op in pd dialect. 
+ if (op_name.substr(0, 3) != "pd.") continue; + op_name = op_name.substr(3); + if (pd_dialect_inputs_info_map_.find(op_name) == + pd_dialect_inputs_info_map_.end() || + pd_dialect_outputs_info_map_.find(op_name) == + pd_dialect_outputs_info_map_.end()) { + // Todo: print log + continue; + } + + phi::KernelSignature kernel_sign = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + // Todo: print error log + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); + } + + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + // Todo: print error log + return; + } + uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); + } + if (!op->use_empty()) { + // Todo: print error log + return; + } + op->erase(); + } +} +void phiOpCvtPass::diapatchStage() { + std::vector worklist; + mlir::Block &block = getFunction().front(); + for (auto &op : block) { + infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); + if (nullptr != kernel_op) worklist.push_back(kernel_op); + } + // ToDo: implementation in the next PR + while (!worklist.empty()) { + // infrt::KernelOp kernel_op = worklist.back(); + worklist.pop_back(); + // std::string kernel_name = kernel_op.name().str(); + // std::vector candidates = + // getCandidateKernels(kernel_name, valid_places_); + } +} +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..051fee9b61a24772ff2295280fa1b0a1588d7bae --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { +/* + * phiOpCvtPass. + * + * Convert the general operators in pd Dialect to a infrt.kernelOp. + * + * source func: + * + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.conv2d"(%a) ... + * %d = "pd.conv3d"(%c) ... + * %f = "pd.conv2d"(%a) ... 
+ * "pd.fetch" (%d, %f) + * } + * + * destination func: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "infrt.kernel"(%a){name = "conv2d"} ... + * %d = "infrt.kernel"(%c){name = "conv3d"}... + * %f = "infrt.kernel"(%a){name = "conv2d"}... + * "pd.fetch" (%d, %f) + * } + */ +class phiOpCvtPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + void runOnFunction() override; + explicit phiOpCvtPass(std::vector valid_places = std::vector()) + : valid_places_(valid_places) {} + + private: + void convertStage(); + void diapatchStage(); + std::vector valid_places_; +}; +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..64b184359700ee2625e3c61d21617619a50771e3 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" + +namespace infrt { + +bool ProtoArgumentMappingContext::HasInput(const std::string& name) const { + if (input_map_.find(name) == input_map_.end()) { + return false; + } + uint8_t index = input_map_.at(name); + return static_cast(op_->getOperand(index)); +} + +bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const { + if (output_map_.find(name) == output_map_.end()) { + return false; + } + return true; +} + +bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const { + return op_->hasAttr(name); +} + +paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const { + mlir::Attribute attrs = op_->getAttr(name); + if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null()) { + return paddle::any(str_attr.str()); + } else { + // ToDO: implementation in the ext PR. 
+ return paddle::any(0); + } +} + +size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const { + return op_->getNumOperands(); +} +size_t ProtoArgumentMappingContext::OutputSize(const std::string& name) const { + return op_->getNumResults(); +} + +bool ProtoArgumentMappingContext::IsDenseTensorInput( + const std::string& name) const { + return true; +} +bool ProtoArgumentMappingContext::IsSelectedRowsInput( + const std::string& name) const { + return false; +} + +bool ProtoArgumentMappingContext::IsDenseTensorOutput( + const std::string& name) const { + return true; +} +bool ProtoArgumentMappingContext::IsSelectedRowsOutput( + const std::string& name) const { + return false; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h new file mode 100644 index 0000000000000000000000000000000000000000..843b19d217feb332a278c80378aaeb856442de9a --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/phi/core/compat/arg_map_context.h" + +namespace infrt { +class ProtoArgumentMappingContext : public phi::ArgumentMappingContext { + public: + // only support op in pd dialect + explicit ProtoArgumentMappingContext(mlir::Operation* op) + : op_(op), + input_map_(pd_dialect_inputs_info_map_.at( + op->getName().getIdentifier().str().substr(3))), + output_map_(pd_dialect_outputs_info_map_.at( + op->getName().getIdentifier().str().substr(3))) {} + bool HasInput(const std::string& name) const override; + bool HasOutput(const std::string& name) const override; + bool HasAttr(const std::string& name) const override; + + // now we can't use Attribute here, it will cause phi relay on + // boost::variant and BlockDesc + paddle::any Attr(const std::string& name) const override; + + size_t InputSize(const std::string& name) const override; + size_t OutputSize(const std::string& name) const override; + + bool IsDenseTensorInput(const std::string& name) const override; + bool IsSelectedRowsInput(const std::string& name) const override; + + bool IsDenseTensorOutput(const std::string& name) const override; + bool IsSelectedRowsOutput(const std::string& name) const override; + + private: + mlir::Operation* op_; + const std::unordered_map& input_map_; + const std::unordered_map& output_map_; +}; + +} // namespace infrt diff --git a/paddle/infrt/dialect/pten/pten_base.cc b/paddle/infrt/dialect/phi/phi_base.cc similarity index 75% rename from paddle/infrt/dialect/pten/pten_base.cc rename to paddle/infrt/dialect/phi/phi_base.cc index ba87787dd7f7caa73a1387c687a96c44c52d26d0..a1caa40f6383b5016a9e237733a0b3ef016cbc97 100644 --- a/paddle/infrt/dialect/pten/pten_base.cc +++ b/paddle/infrt/dialect/phi/phi_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/phi_base.h" #include #include @@ -21,14 +21,14 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/pten/infrt_pten_base.cpp.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" namespace infrt { -namespace pten { +namespace phi { -void PTENDialect::printType(::mlir::Type type, - mlir::DialectAsmPrinter& os) const { +void PHIDialect::printType(::mlir::Type type, + mlir::DialectAsmPrinter& os) const { if (type.isa()) { os << "CPU_Allocator"; return; @@ -48,18 +48,18 @@ void PTENDialect::printType(::mlir::Type type, llvm_unreachable("unexpected 'allocator/context' type kind"); } -void PTENDialect::initialize() { +void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/pten/infrt_pten_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } -mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { +mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); if (keyword == "CPU_allocator") { @@ -77,8 +77,8 @@ mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { return mlir::Type(); } -} // namespace pten +} // namespace phi } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/phi_base.h new file mode 100644 index 0000000000000000000000000000000000000000..11174290f92bd18fdc91588d7eba89f61bb05413 --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_base.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +#include + +#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" + +#define GET_TYPEDEF_CLASSES +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" + +namespace mlir { +namespace OpTrait { + +template +class PhiOpTrait : public OpTrait::TraitBase { + public: + static LogicalResult verifyTrait(Operation *op) { + return LogicalResult::success(); + } +}; + +} // namespace OpTrait +} // namespace mlir + +namespace infrt { +namespace phi {} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e99661a6a20590e7d36c1cf3a0e1e5d334b2464 --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +int main(int argc, char** argv) { + static llvm::cl::opt input_file( + llvm::cl::Positional, + llvm::cl::desc("Specify input filename"), + llvm::cl::init("-")); + + llvm::cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + + module->dump(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(std::make_unique(valid_places)); + if (mlir::failed(pm.run(*module))) { + std::cout << "\npass failed!\n" << std::endl; + return 4; + } + module->dump(); + return 0; +} diff --git a/paddle/infrt/dialect/pten/CMakeLists.txt b/paddle/infrt/dialect/pten/CMakeLists.txt deleted file mode 100644 index b4ed5cdc1d82fd4a32f8594dc41b6e32c3e52459..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pten/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -if (NOT INFRT_WITH_PTEN) - return() -endif() - -#mlir_tablegen_on(infrt_pten_base DIALECT pten) -add_mlir_dialect(infrt_pten_base pten) -add_mlir_dialect(infrt_pten_tensor pten_dt) -add_mlir_dialect(infrt_pten_kernel pten_kernel) -#mlir_tablegen_on(infrt_pten_tensor) - -gather_srcs(infrt_src SRCS - pten_base.cc infrt_pten_tensor.cc - infrt_pten_tensor.cc) diff --git a/paddle/infrt/dialect/pten/infrt_pten_kernel.td b/paddle/infrt/dialect/pten/infrt_pten_kernel.td deleted file mode 100644 index a3a1609d9918aea754666b8ec0bcc467fad4d756..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pten/infrt_pten_kernel.td +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef PTEN_KERNEL -#define PTEN_KERNEL - -include "paddle/infrt/dialect/pten/infrt_pten_tensor.td" - -def 
PTEN_KernelDialect : Dialect { - let name = "pten_kernel"; - - let description = [{ - The PTEN Kernel dialect. - }]; - - let cppNamespace = "::infrt::pten"; -} - -// PTEN Kernel related ops. -class PDT_Kernel traits = []> : Op { -} - -def FakeKernelOp : PDT_Kernel<"pten.matmul.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, TensorType:$x, TensorType:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); - let results = (outs TensorType:$output); -} - -#endif - diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt index 794266513eb81b36655f44bfd1f6623216690ac5..99c335ed1782e8089f77bb3f21aadb00f6f6864f 100755 --- a/paddle/infrt/dialect/tensorrt/CMakeLists.txt +++ b/paddle/infrt/dialect/tensorrt/CMakeLists.txt @@ -2,11 +2,13 @@ core_gather_headers() gather_srcs(infrt_src SRCS trt_ops.cc + trt_op_converter_pass.cc trt_op_teller_pass.cc trt_graph_fuse_pass.cc trt_graph_split_pass.cc ) mlir_tablegen_on(trt_ops) +mlir_add_rewriter(pd_lower_to_trt) add_executable(trt-exec trt_exec.cc) target_link_libraries(trt-exec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td new file mode 100644 index 0000000000000000000000000000000000000000..701391a750354938efe3703ef8642b21f8a878ea --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -0,0 +1,28 @@ +#ifndef PD_LOWER_TO_TRT +#define PD_LOWER_TO_TRT + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/tensorrt/trt_ops.td" + +def PD2TRT_Matmul_Lower : Pat< + (PD_MatmulOp $X, $Y, $transpose_X, $transpose_Y, ConstantAttr, ConstantAttr), + (TRT_MatrixMultiplyOp $X, $transpose_X, $Y, $transpose_Y)>; + +//TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ElementWiseOperation::kSUM +def PD2TRT_ElementwiseAdd_Lower : Pat< + (PD_Elementwise_addOp $X, $Y, ConstantAttr), + (TRT_ElementWiseOp $X, $Y, (INFRT_createSI32Attr<"0">)/*kSUM*/)>; + +//TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ActivationType::kRELU +def PD2TRT_Relu_Lower : Pat< + (PD_ReluOp $X), + (TRT_ActivationOp $X, (INFRT_createSI32Attr<"0">)/*kRELU*/, (INFRT_createF32Attr<"0.0">), (INFRT_createF32Attr<"0.0">))>; + +//TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ActivationType::kCLIP +def PD2TRT_Relu6_Lower : Pat< + (PD_Relu6Op $X, $threshold), + (TRT_ActivationOp $X, (INFRT_createSI32Attr<"8">)/*kCLIP*/, (INFRT_createF32Attr<"0.0">), $threshold)>; + +#endif // PD_LOWER_TO_TRT diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 1baef7a3f77fdd9d3e363110ea3679aa942e222f..7af1fa53d12e3113d0fe51e7ba15bbd5c082456c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -19,6 +19,7 @@ #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" int main(int argc, char** argv) { @@ -36,9 +37,10 @@ int main(int argc, char** argv) { mlir::PassManager pm(context); mlir::OpPassManager& trt_pass_manager = pm.nest(); - trt_pass_manager.addPass(std::make_unique()); - trt_pass_manager.addPass(std::make_unique()); - 
trt_pass_manager.addPass(std::make_unique(10)); + trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(std::make_unique(1)); + trt_pass_manager.addPass(std::make_unique()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; return 4; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 1da80ef2c3b1000c045327510a03081f8aa954ca..17633a4e8e99293524e5ca635069267e27c2a603 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -142,7 +142,7 @@ void topoSortBlock(mlir::Block &body) { // NOLINT } // namespace // Implementation of the trtGraphFusePass. -void trtGraphFusePass::runOnFunction() { +void TRTGraphFusePass::runOnFunction() { mlir::Block &body = getFunction().front(); mlir::OpBuilder builder(&body, body.begin()); bool changed = false; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index f1e555c6f67ecaadff76fb17f68ebaae1a6528e1..ebd7a4ac4bd3712d98df4a097682787b3977ebfb 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -52,8 +52,8 @@ namespace trt { * "pd.fetch" %d, %f * } */ -class trtGraphFusePass - : public mlir::PassWrapper { +class TRTGraphFusePass + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 257f2b528542557db33121a4c304eb8e6f657007..f24b9cc40cdcc2b065ea033cb03638e8d292df89 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -21,7 +21,7 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 -void trtGraphSplitPass::runOnFunction() { +void TRTGraphSplitPass::runOnFunction() { std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index d30d186647fc32aa4e16047000ee4071effb900d..51f84227243403f5a2299d820acad1b49592abc3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -45,12 +45,12 @@ namespace trt { * "pd.fetch" (%d, %f) * } */ -class trtGraphSplitPass - : public mlir::PassWrapper { +class TRTGraphSplitPass + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } void runOnFunction() override; - explicit trtGraphSplitPass(size_t min_subgraph_size = 3) + explicit TRTGraphSplitPass(size_t min_subgraph_size = 3) : min_subgraph_size_(min_subgraph_size) {} private: diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e34308a2f0fa8c3c0142a62324f00c29b61fd7d3 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
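To illustrate what the new DRR patterns in pd_lower_to_trt.td are meant to do once TRTOpConverterPass runs, a before/after sketch (not part of this patch), written as raw strings in the style of mlir_loader_test.cc. The pd.* mnemonics, operand/result types, and textual attribute forms are assumptions; the attribute values follow the kSUM/kRELU comments in the .td file above.

// Sketch only: expected effect of PD2TRT_ElementwiseAdd_Lower and PD2TRT_Relu_Lower.
const char *before_lowering = R"ROC(
  %c = "pd.elementwise_add"(%a, %b) {axis = -1 : si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
  %d = "pd.relu"(%c) : (tensor<?xf32>) -> tensor<?xf32>
)ROC";

const char *after_lowering = R"ROC(
  %c = "trt.ElementWise"(%a, %b) {elementwise_operation = 0 : si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
  %d = "trt.Activation"(%c) {activation_type = 0 : si32, alpha = 0.0 : f32, beta = 0.0 : f32} : (tensor<?xf32>) -> tensor<?xf32>
)ROC";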
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" +#include "mlir/IR/Builders.h" +#include "mlir/Transforms/DialectConversion.h" +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/pd_ops.h" + +namespace infrt { +namespace trt { + +#include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT + +using namespace mlir; + +void TRTOpConverterPass::runOnOperation() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to TensorRTDialect from + // PaddleDialect + target.addLegalDialect(); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the TensorRT operations. + RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. + if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); +} + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..0adbf11b89144b0a9e14dc158e2eab1c56e2563a --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "mlir/IR/Dialect.h" +#include "mlir/Pass/Pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" + +namespace infrt { +namespace trt { +/* + * trtOpConverterPass. + * + * source ir: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %d, %f = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * %n = "pd.conv3d"(%m)... + * %s = "pd.conv2d"(%a)... + * "pd.return" %n, %s + * } ... + * "pd.fetch" %d, %f + * } + * + * destination ir: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %d, %f = "pd.graph"(%a) { + * %m = "trt.Convolution"(%a)... + * %n = "trt.Convolution"(%m)... + * %s = "trt.Convolution"(%a)... + * "pd.return" %n, %s + * } ... 
+ * "pd.fetch" %d, %f + * } + */ +struct TRTOpConverterPass + : public mlir::PassWrapper> { + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + ::llvm::StringRef getName() const override { return "trtOpConverterPass"; } + void runOnOperation() final; +}; +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 4e8d40b982b2eaf13aeef4f026d783c3f353c14b..176fdb7a2e054ac2e0c952c7af27995cf8e3c433 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -20,7 +20,7 @@ namespace infrt { namespace trt { // Implementation of the trtOpTellerPass。 -void trtOpTellerPass::runOnFunction() { +void TRTOpTellerPass::runOnFunction() { mlir::Block &body = getFunction().front(); std::vector worklist; worklist.reserve(body.getOperations().size()); diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index fb16c974f7fb3f923bdc460d62d8e5b9f628fff9..8b9a16376ce5527b2133c9f2c2ecea928fb4cd8f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -52,8 +52,8 @@ namespace trt { * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. */ -class trtOpTellerPass - : public mlir::PassWrapper { +class TRTOpTellerPass + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index cc072b6e6885bb68df5cf216fe210aded8a6ec6a..8e3dfffff54f13cc6d1f23c3459ed45257082d4f 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -23,8 +23,48 @@ def TRT_GraphOp : TRT_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - + let arguments = (ins Variadic:$inputs); let results = (outs Variadic:$outputs); } + +def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { + let summary = "TensorRT IActivationLayer"; + let description = [{ + + TensorRT IActivationLayer. + + }]; + let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + DefaultValuedAttr:$alpha, + DefaultValuedAttr:$beta); + + let results = (outs TRT_Tensor:$output); +} + +def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { + let summary = "TensorRT IElementWiseLayer"; + let description = [{ + + TensorRT IElementWiseLayer. + + }]; + let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + + let results = (outs TRT_Tensor:$output); +} + +def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { + let summary = "TensorRT IMatrixMultiplyLayer"; + let description = [{ + + TensorRT IMatrixMultiplyLayer. 
+ + }]; + let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, + TRT_Tensor:$input2, BoolAttr:$transpose2); + + let results = (outs TRT_Tensor:$output); +} + #endif // TRT_OPS diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index c4588d7cf8bab748832865fc3aaab1913f33d11b..f0c4723b49a7906cf5327771e26eb87e8b1248c0 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); - if (last_op.getName().getStringRef() != "infrt.return") { + if (last_op.getName().getStringRef() != "Infrt.return") { return op.emitOpError("missing return statement"); } if (last_op.getNumOperands() != 1) { diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/test_kernels.td index 6aa12f252d0144c814e70e57c336a64df47de95b..6e4bc26aa1496dcb4caed83f98fc42dab9e3cce0 100644 --- a/paddle/infrt/dialect/test_kernels.td +++ b/paddle/infrt/dialect/test_kernels.td @@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> { // The following code benchmarks the infrt.add.i32 kernel. %x = infrt.add.i32 %c, %c // The benchmarked function needs to return exactly one value. - infrt.return %x : i32 + Infrt.return %x : i32 } }]; diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir index 843b12ced21a982b18b5a63f7bbef1d4d24eea16..1a7ea854c9ce469ee5719743287b4ee1b5de9286 100644 --- a/paddle/infrt/external_kernels/basic.mlir +++ b/paddle/infrt/external_kernels/basic.mlir @@ -1,7 +1,7 @@ // CHECK: basic func @basic() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 @@ -17,5 +17,5 @@ func @basic() -> f32 { // CHECK: 6 "external.print.f32"(%v3) : (f32) -> () - infrt.return %v3 : f32 + Infrt.return %v3 : f32 } diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir index bdac9ded2ef65dd4a09830b69838cb67863f1823..b0cabddc3ebc4a9ede73d506ac58acaa140f03d5 100644 --- a/paddle/infrt/external_kernels/fc.mlir +++ b/paddle/infrt/external_kernels/fc.mlir @@ -1,43 +1,43 @@ // CHECK-LABEL: @fc -func @fc(%input : !infrt.tensor, - %w : !infrt.tensor, - %bias : !infrt.tensor) -> !infrt.tensor +func @fc(%input : !Infrt.tensor, + %w : !Infrt.tensor, + %bias : !Infrt.tensor) -> !Infrt.tensor { - %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - // dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} // fc1 - "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () // fc2 - "external.matmul"(%out, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - 
"external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () - infrt.return %out : !infrt.tensor + Infrt.return %out : !Infrt.tensor } // CHECK-LABEL: @benchmark func @benchmark() { - %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} - infrt.benchmark "add.f32"( - %input:!infrt.tensor, - %w:!infrt.tensor, - %bias:!infrt.tensor) + Infrt.benchmark "add.f32"( + %input:!Infrt.tensor, + %w:!Infrt.tensor, + %bias:!Infrt.tensor) duration_secs = 100, max_count = 300000, num_warmup_runs = 3 { - %res = infrt.call @fc(%input, %w, %bias) : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor) - infrt.return %res : !infrt.tensor + %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> (!Infrt.tensor) + Infrt.return %res : !Infrt.tensor } - infrt.return + Infrt.return } diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir index e7b8e9efba838bded2fe86d901422fca7005e507..d55d9904b5bc4e43388abacf9e4b62bf06db458b 100644 --- a/paddle/infrt/external_kernels/paddle.mlir +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -1,50 +1,50 @@ // CHECK: paddle_func func @paddle_func() -> () { - %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [5, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} - %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} - "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.tensor, !infrt.tensor, 
!infrt.tensor, !infrt.tensor) -> () + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%input : !infrt.tensor) + dt.print_tensor (%input : !Infrt.tensor) // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - dt.print_tensor (%w : !infrt.tensor) - dt.print_tensor (%bias : !infrt.tensor) - dt.print_tensor (%out : !infrt.tensor) + dt.print_tensor (%w : !Infrt.tensor) + dt.print_tensor (%bias : !Infrt.tensor) + dt.print_tensor (%out : !Infrt.tensor) // test external.matmul - %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out1 : !infrt.tensor) {value=0.0:f32} - "external.matmul"(%input, %w, %out1) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out1 : !infrt.tensor) + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out1 : !Infrt.tensor) // test external.elementwise_add - %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out2 : !infrt.tensor) {value=0.0:f32} - %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.tensor) {value=3.0:f32} - "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out2 : !infrt.tensor) + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out2 : !Infrt.tensor) // test external.relu - %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out3 : !infrt.tensor) {value=0.0:f32} - "external.relu"(%out1, %out3) {}: (!infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out3 : !infrt.tensor) + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out3 : !Infrt.tensor) // test external.sigmoid - %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out4 : !infrt.tensor) {value=0.0:f32} - "external.sigmoid"(%out1, %out4) {}: (!infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out4 : !infrt.tensor) + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out4 : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc index 1acb35e898308a96fa53bc39c484f93887d70668..14e88be4b96bb58df87db3191db8bae444c4cc3d 100644 --- a/paddle/infrt/host_context/kernel_frame.cc +++ 
b/paddle/infrt/host_context/kernel_frame.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/host_context/kernel_frame.h" #include +#include namespace infrt { namespace host_context { @@ -25,5 +26,36 @@ std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { return os; } +#ifndef NDEBUG +std::string KernelFrame::DumpArgTypes() const { + std::stringstream ss; + for (auto* value : GetValues(0, GetNumElements())) { + if (value->is_type()) { + ss << "bool (" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "DenseHostTensor(" << &value->get() + << "), "; + } else if (value->is_type()) { + ss << "float(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "int(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "phi::DenseTensor(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "phi::MetaTensor(" << &value->get() << "), "; + } else if (value->is_type<::phi::CPUContext>()) { + ss << "phi::CPUContext(" << &value->get<::phi::CPUContext>() << "), "; + } else if (value->is_type()) { + ss << "none(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "CpuPhiContext(" << &value->get() << "), "; + } else { + ss << "typeid: " << value->index() << ", "; + } + } + return ss.str(); +} +#endif + } // namespace host_context } // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 35527872e624f74209e470de24653faa7bd778c3..90887edb991660083e9a6649658d40e96f1642af 100644 --- a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -31,20 +31,24 @@ namespace host_context { class KernelFrame { public: int GetNumArgs() const { return num_arguments_; } - int GetNumResults() const { return num_results_ == -1 ? 0 : num_results_; } - int GetNumAttributes() const { - return value_or_attrs_.size() - num_arguments_ - - (num_results_ == -1 ? 0 : num_results_); + int GetNumResults() const { + return value_or_attrs_.size() - num_arguments_ - GetNumAttributes(); } + int GetNumAttributes() const { return num_attrs_ == -1 ? 0 : num_attrs_; } //! Get something at a specific position \p index. The element might be an //! argument, an attribute or a result. template T& GetElementAt(int index) { - CHECK_LT(index, GetNumArgs() + GetNumAttributes() + GetNumResults()); + CHECK_LT(static_cast(index), GetNumElements()); return value_or_attrs_[index]->template get_or_default(); } + Value* GetElementAt(int index) { + CHECK_LT(static_cast(index), GetNumElements()); + return value_or_attrs_[index]; + } + // Get number of elements, either input, attributes or results. 
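+  // Values are stored in value_or_attrs_ in the fixed order
+  // [arguments..., attributes..., results...]; the offsets used by
+  // GetAttributeAt/GetResults/SetResultAt below are computed from
+  // num_arguments_ and the attribute count accordingly.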
size_t GetNumElements() const { return value_or_attrs_.size(); } @@ -70,18 +74,21 @@ class KernelFrame { } Value* GetAttributeAt(int idx) { - CHECK_NE(num_results_, -1) - << "Must call SetNumResults before GetAttributeAt"; - CHECK_LT(idx, - static_cast(value_or_attrs_.size() - num_arguments_ - - num_results_)); - return value_or_attrs_[num_arguments_ + num_results_ + idx]; + // CHECK_NE(num_results_, -1) + //<< "Must call SetNumResults before GetAttributeAt"; + CHECK_LT(idx, GetNumAttributes()); + return value_or_attrs_[num_arguments_ + idx]; } void AddAttribute(Value* v) { - CHECK_NE(num_results_, -1) - << "Must call SetNumResults before calling AddAttribute"; + CHECK_LE(num_results_, 0) + << "Must call SetNumResults after calling AddAttribute"; value_or_attrs_.emplace_back(v); + if (num_attrs_ == -1) num_attrs_ = 0; + num_attrs_++; + + CHECK_EQ(value_or_attrs_.size(), + static_cast(num_arguments_ + num_attrs_)); } template @@ -96,35 +103,43 @@ class KernelFrame { template void SetResultAt(int index, T&& value) { - CHECK_LT(index, num_results_) << "Invalid result index"; - CHECK(value_or_attrs_[num_arguments_ + index]); - value_or_attrs_[num_arguments_ + index]->set(std::move(value)); + CHECK_LT(index, GetNumResults()) << "Invalid result index"; + CHECK(value_or_attrs_[num_arguments_ + GetNumAttributes() + index]); + value_or_attrs_[num_arguments_ + GetNumAttributes() + index]->set( + std::move(value)); } llvm::ArrayRef GetResults() const { - return GetValues(num_arguments_, num_results_); + CHECK_GE(num_results_, 0) << "Invalid results num"; + return GetValues(num_arguments_ + GetNumAttributes(), num_results_); } llvm::MutableArrayRef GetResults() { - return GetMutableValues(num_arguments_, num_results_); + CHECK_GE(num_results_, 0) << "Invalid results num"; + return GetMutableValues(num_arguments_ + GetNumAttributes(), num_results_); } llvm::ArrayRef GetValues(size_t from, size_t length) const { - CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + CHECK_LE(from + length, GetNumElements()); if (length == 0) return {}; return llvm::makeArrayRef(&value_or_attrs_[from], length); } llvm::MutableArrayRef GetMutableValues(size_t from, size_t length) { - CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + CHECK_LE(from + length, GetNumElements()); if (length == 0) return {}; return llvm::makeMutableArrayRef(&value_or_attrs_[from], length); } +#ifndef NDEBUG + std::string DumpArgTypes() const; +#endif + bool IsEmpty() const { return value_or_attrs_.empty(); } protected: int num_arguments_{}; + int num_attrs_{-1}; int num_results_{-1}; llvm::SmallVector value_or_attrs_; @@ -136,15 +151,15 @@ class KernelFrameBuilder : public KernelFrame { public: void AddArgument(Value* value) { CHECK(value); - CHECK_EQ(num_results_, -1) - << "Should call AddArgument before calling SetNumResults"; + CHECK_EQ(num_attrs_, -1) + << "Should call AddArgument before calling SetAttributes"; value_or_attrs_.push_back(value); ++num_arguments_; } void SetResults(llvm::ArrayRef values) { - CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); - CHECK_EQ(num_results_, -1); + CHECK_EQ(num_arguments_ + GetNumAttributes(), + static_cast(value_or_attrs_.size())); for (Value* x : values) { value_or_attrs_.push_back(x); } @@ -152,28 +167,30 @@ class KernelFrameBuilder : public KernelFrame { } void SetNumResults(size_t n) { - CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); - CHECK_EQ(num_results_, -1); - num_results_ = n; + CHECK_EQ(num_arguments_ + GetNumAttributes(), 
+ static_cast(value_or_attrs_.size())); for (size_t i = 0; i < n; i++) { value_or_attrs_.emplace_back(new Value); } + num_results_ = n; } void SetResultAt(int result_id, Value* value) { CHECK_EQ(static_cast(value_or_attrs_.size()), - num_arguments_ + num_results_) + num_arguments_ + GetNumAttributes() + num_results_) << "Call SetNumResults first"; - CHECK_LT(result_id + num_arguments_, + CHECK_LT(result_id + num_arguments_ + GetNumAttributes(), static_cast(value_or_attrs_.size())); CHECK(value); - value_or_attrs_[num_arguments_ + result_id]->set(value); + value_or_attrs_[num_arguments_ + GetNumAttributes() + result_id]->set( + value); } void Reset() { value_or_attrs_.clear(); num_arguments_ = 0; num_results_ = -1; + num_attrs_ = -1; } }; diff --git a/paddle/infrt/host_context/kernel_utils.h b/paddle/infrt/host_context/kernel_utils.h index 31d411006d2378eb77d254c76baf25809c79bb42..2f630dcc213cb6f46b7e48c5210124c3324a874a 100644 --- a/paddle/infrt/host_context/kernel_utils.h +++ b/paddle/infrt/host_context/kernel_utils.h @@ -209,9 +209,11 @@ struct KernelImpl { static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { static_assert(out_idx != -1, "Do not place Results after RemainingResults"); - static_assert(const_idx == 0, - "Arguments and results should appear before attributes"); - Result arg(&frame->GetResults()[out_idx]); + // static_assert(const_idx == 0, + // "Arguments and results should appear before attributes"); + + // Result arg(&frame->GetResults()[out_idx]); + Result arg(new ValueRef()); KernelCallHelper< Tail...>::template Invoke(frame, pargs..., @@ -224,8 +226,8 @@ struct KernelImpl { struct KernelCallHelper, Tail...> { template static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { - static_assert(const_idx != -1, - "Do not place Attributes after RemainingAttributes"); + // static_assert(const_idx != -1, + // "Do not place Attributes after RemainingAttributes"); Attribute arg(frame->GetAttributeAt(const_idx)); KernelCallHelper< Tail...>::template Invoke(frame, @@ -242,8 +244,8 @@ struct KernelImpl { static_assert(in_idx != -1, "Do not place Arguments after RemainingArguments"); static_assert(out_idx == 0, "Arguments should appear before results"); - static_assert(const_idx == 0, - "Arguments and results should appear before attributes."); + // static_assert(const_idx == 0, + // "Arguments and results should appear before attributes."); auto* arg = &frame->template GetElementAt(in_idx); KernelCallHelper< Tail...>::template Invoke(frame, @@ -265,7 +267,7 @@ struct KernelImpl { static_assert(const_idx == 0, "Arguments and results should appear before attributes."); - auto* value = frame->GetArgAt(in_idx); + auto* value = frame->GetElementAt(in_idx); auto&& arg = value->get(); KernelCallHelper< diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index bebd8d86e50bbd6a2d80325f9fbd8254718c8d0a..71d8904eb798fbe638ea5a5e1af3824db94c4357 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -67,5 +67,45 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } +void TestFunc(const std::string& arg_0, + const std::string& arg_1, + const std::string& arg_2, + Attribute attr_0, + Result res_0, + Result res_1) { + CHECK_EQ(arg_0, "arg_0"); + CHECK_EQ(arg_1, "arg_1"); + CHECK_EQ(arg_2, "arg_2"); + CHECK_EQ(attr_0.get(), "attr_0"); + + // res_0.Set(Argument(ValueRef(new Value()))); + // res_1.Set(Argument(ValueRef(new Value()))); +} + 
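+// The test below exercises the new frame layout: AddArgument() must be called
+// for every argument first, then AddAttribute(), then SetNumResults(), so the
+// frame stores [args | attrs | results]. With 3 arguments, 1 attribute and
+// 2 results that gives 6 elements, matching the CHECKs in the test.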
+TEST(KernelRegistry, basic) { + KernelFrameBuilder kernel_frame; + + Value arg_0(std::string{"arg_0"}); + Value arg_1(std::string{"arg_1"}); + Value arg_2(std::string{"arg_2"}); + Value attr_0(std::string{"attr_0"}); + + kernel_frame.AddArgument(&arg_0); + kernel_frame.AddArgument(&arg_1); + kernel_frame.AddArgument(&arg_2); + kernel_frame.AddAttribute(&attr_0); + kernel_frame.SetNumResults(2); + + CHECK_EQ(kernel_frame.GetNumArgs(), 3); + CHECK_EQ(kernel_frame.GetNumResults(), 2); + CHECK_EQ(kernel_frame.GetNumAttributes(), 1); + CHECK_EQ(kernel_frame.GetNumElements(), 6UL); + + CHECK_EQ(kernel_frame.GetArgAt(2), "arg_2"); + CHECK_EQ(kernel_frame.GetAttributeAt(0)->get(), "attr_0"); + + KernelImpl::Invoke(&kernel_frame); +} + } // namespace host_context } // namespace infrt diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 62c907bc9159f4b3ee8e03878736fb30106c4616..79717ba2cc034650726f9e88c9dc31f1f1349c66 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -28,8 +28,8 @@ #include "paddle/infrt/kernel/tensor_kernels.h" #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -#ifdef INFRT_WITH_PTEN -#include "paddle/infrt/kernel/pten/registry.h" +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/kernel/phi/registry.h" #endif static llvm::cl::list cl_shared_libs( // NOLINT @@ -56,8 +56,8 @@ int main(int argc, char** argv) { kernel::RegisterTensorShapeKernels(®istry); kernel::RegisterTensorKernels(®istry); kernel::RegisterControlFlowKernels(®istry); -#ifdef INFRT_WITH_PTEN - kernel::RegisterPtenKernels(®istry); +#ifdef INFRT_WITH_PHI + kernel::RegisterPhiKernels(®istry); #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir index 263d5884134b143aa8d3403c5cd05672df39636f..1b55b408f2b082c09d06d51037e8c9d967a171f4 100644 --- a/paddle/infrt/host_context/mlir_tests/basic.mlir +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: basic func @basic() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 - "infrt.print.f32"(%v0) : (f32) -> () + "Infrt.print.f32"(%v0) : (f32) -> () // CHECK: 2 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () // CHECK: 3 - "infrt.print.f32"(%v2) : (f32) -> () + "Infrt.print.f32"(%v2) : (f32) -> () - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 // CHECK: 6 - "infrt.print.f32"(%v3) : (f32) -> () + "Infrt.print.f32"(%v3) : (f32) -> () - infrt.return %v3 : f32 + Infrt.return %v3 : f32 } // CHECK-LABEL: basic1 // Check the mlir executor can work with more than one function in a file. 
func @basic1() -> () { - %v0 = infrt.constant.f32 1.0 - "infrt.print.f32"(%v0) : (f32) -> () + %v0 = Infrt.constant.f32 1.0 + "Infrt.print.f32"(%v0) : (f32) -> () // CHECK: 1 - infrt.return + Infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir index 83afa1db8a91c03f1b22c5b6728e398ed361b472..5a973a3eb23e6015ede2d69d83ab8c26de669908 100644 --- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -1,9 +1,9 @@ // CHECK-LABEL: build_tensor1 func @build_tensor1() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir index a3130857b0ef7d50821a20cfbc9138aaecc74ff7..22df1c8010d8dbd6a4b8e332e01602b4421ebcdd 100644 --- a/paddle/infrt/host_context/mlir_tests/shape.mlir +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -3,5 +3,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 3dbc7a702be38d986b6f77b345abe85f939370e6..17e6f7cb563d25186f9a76de8fe67af2ddb90e7b 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -31,6 +31,7 @@ #include "boost/optional.hpp" #include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/host_context/core_runtime.h" @@ -74,7 +75,7 @@ struct MlirToRuntimeTranslator::Impl { }; bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { - if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) + if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant")) return false; VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() << "]"; @@ -150,6 +151,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( return boost::none; } +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); + return val.getValue(); + } + return boost::none; +} + template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { @@ -187,6 +199,7 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( return res; \ } +PROCESS_ARRAY_INT(bool, 1); PROCESS_ARRAY_INT(int16_t, 16); PROCESS_ARRAY_INT(int32_t, 32); PROCESS_ARRAY_INT(int64_t, 64); @@ -224,7 +237,7 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } static bool IsReturn(mlir::Operation* op) { - return op->getName().getStringRef() == "infrt.return"; + return op->getName().getStringRef() == "Infrt.return"; } bool 
MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { @@ -262,25 +275,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { << GetValue(operand) << " vs " << arg_value; } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - res_values.push_back(AddValue(res)); - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process attributes auto attrs = op->getAttrs(); @@ -296,6 +290,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { impl_->cur_op->AppendAttribute(new Value(*v)); } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute(attr.getValue())) { + impl_->cur_op->AppendAttribute(new Value(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else if (auto v = EmitAttribute>(attr.getValue())) { @@ -311,6 +307,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { } } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + // process regions, we treat regions as attribute. auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -345,7 +368,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { bool MlirToRuntimeTranslator::EmitReturnOp( mlir::Operation* op, llvm::SmallVectorImpl* results) { CHECK(results); - if (op->getName().getStringRef() == "infrt.return") { + if (op->getName().getStringRef() == "Infrt.return") { for (size_t i = 0; i < op->getNumOperands(); i++) { results->push_back(op->getOperand(i)); } @@ -418,7 +441,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, function_defs_t* function_table) { CHECK(op); CHECK(function_table); - if (op->getName().getStringRef() != "infrt.call") return false; + if (op->getName().getStringRef() != "Infrt.call") return false; impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); @@ -440,14 +463,6 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, impl_->cur_op->AppendArgument(arg_value); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - res_values.push_back(AddValue(res)); - } - impl_->cur_op->SetResults(res_values); - // process attribute auto& table = function_table ? 
*function_table : impl_->func_defs; { @@ -460,6 +475,14 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + } + impl_->cur_op->SetResults(res_values); + VLOG(3) << "Emit call " << callee_name.getValue().str() << " " << impl_->cur_op->frame(); return true; diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index fcd79eaf386eed5a6a8eaa31712e344bab56dbd4..0c453651d9e6dc44adaf108ec6a1b0df984fe8be 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -57,7 +57,7 @@ class MlirToRuntimeTranslator { protected: //! Emit a "infrt.constant.*" operation, return true if succeed. bool EmitConstantOp(mlir::Operation* op); - //! Emit a "infrt.return" operation. + //! Emit a "Infrt.return" operation. bool EmitReturnOp(mlir::Operation* op, llvm::SmallVectorImpl* results); //! Emit a "ts.build_shape" operation. diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 375daa4515e17fe1618c71d642825d112a3f788f..5824e40abf97a4d63543948d056e815bbeebce3a 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) { auto source = R"ROC( func @main() -> () { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () - infrt.return + Infrt.return } )ROC"; @@ -63,14 +63,14 @@ TEST(TestMlir, basic) { auto source = R"ROC( func @main() -> () { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () - infrt.return + Infrt.return } )ROC"; @@ -88,18 +88,20 @@ TEST(TestMlir, shadow_copy_tensor_profile) { mlir::MLIRContext* context = infrt::Global::getMLIRContext(); auto head = R"ROC( -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { )ROC"; auto tpl0 = - "%a{0} = dt.shallow_copy_tensor %a : !infrt.tensor -> " - "!infrt.tensor"; + "%a{0} = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> " + "!infrt.dense_tensor"; auto tpl1 = - "%b{0} = dt.shallow_copy_tensor %b : !infrt.tensor -> " - "!infrt.tensor"; + "%b{0} = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> " + "!infrt.dense_tensor"; auto end = R"ROC( -infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } )ROC"; diff --git 
a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index cf40d7315c6a58e4c9cca5e2be4fe2a24922d0ac..59a73e71083286b81f2bbdfa20a4ed96a8353a2f 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -133,7 +133,8 @@ void OpExecutable::Execute() { VLOG(3) << "execute " << name() << " --- frame args: " << impl_->frame.GetNumArgs() << " results " << impl_->frame.GetNumResults() << " attributes " - << impl_->frame.GetNumAttributes(); + << impl_->frame.GetNumAttributes() << "\n" + << frame().DumpArgTypes(); for (int i = 0; i < impl_->frame.GetNumArgs(); i++) { VLOG(3) << "function arg: " << impl_->frame.GetArgAt(i); } diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc index 222c5dcd6c57550b273bb4d29fa5290c46ec1cf9..3f40490557290fcc34a188882c4d4d251f4ba16e 100644 --- a/paddle/infrt/host_context/value.cc +++ b/paddle/infrt/host_context/value.cc @@ -24,7 +24,7 @@ ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} ValueRef::ValueRef(float val) : Shared(new Value(val)) {} ValueRef::ValueRef(double val) : Shared(new Value(val)) {} ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} -ValueRef::ValueRef(backends::CpuPtenContext&& val) +ValueRef::ValueRef(backends::CpuPhiContext&& val) : Shared(new Value(std::move(val))) {} ValueRef::ValueRef(::phi::CPUContext&& val) : Shared(new Value(std::move(val))) {} diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index c39ddf69a90e2735db2081bdf0b49bfa1ec50b2e..eb9a2092657aa079ee6a4007d7ded9f8896e93aa 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -29,9 +29,9 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -#ifdef INFRT_WITH_PTEN -#include "paddle/infrt/backends/host/pten_allocator.h" -#include "paddle/infrt/backends/host/pten_context.h" +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/backends/host/phi_allocator.h" +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -45,10 +45,13 @@ namespace infrt { namespace host_context { +struct None {}; + struct MlirFunctionExecutable; using ValueVariantType = - Variant, paddle::experimental::ScalarBase, @@ -108,23 +111,25 @@ class Value : public common::Object { explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} explicit Value(MlirFunctionExecutable* x) : data(x) {} -#ifdef INFRT_WITH_PTEN - explicit Value(backends::CpuPtenContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_PHI + explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} - explicit Value(backends::CpuPtenAllocator&& x) : data(std::move(x)) {} + explicit Value(backends::CpuPhiAllocator&& x) : data(std::move(x)) {} #endif template const T& get() const { - CHECK(data.template is()); + CHECK(data.template is()) << "typeid: " << data.index() + << " != " << ValueVariantType::IndexOf; return data.get(); } template T& get() { - CHECK(data.template is()); + CHECK(data.template is()) << "typeid: " << data.index() + << " != " << ValueVariantType::IndexOf; return data.get(); } @@ -153,6 +158,8 @@ class Value : 
public common::Object { const char* type_info() const override; + ValueVariantType::IndexT index() const { return data.index(); } + friend void CopyTo(const Value& from, Value* to); private: @@ -173,7 +180,7 @@ class ValueRef : common::Shared { explicit ValueRef(double val); explicit ValueRef(bool val); explicit ValueRef(::phi::MetaTensor&& val); - explicit ValueRef(backends::CpuPtenContext&& x); + explicit ValueRef(backends::CpuPhiContext&& x); explicit ValueRef(::phi::CPUContext&& x); explicit ValueRef(::phi::DenseTensor&& x); diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index 402665119ac2dd93214b5b9733352846004c75b3..f1cbfba1c46b33e461a7c9f08cf646625fbafb24 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,10 +1,10 @@ -add_subdirectory(pten) +add_subdirectory(phi) core_gather_headers() gather_srcs(infrt_src SRCS basic_kernels.cc - # pten_kernels.cc + # phi_kernels.cc test_kernels.cc tensor_shape_kernels.cc tensor_kernels.cc diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index b186cfcfd2b355f97711ecc916e497c2916d4060..23e50a5ddc87427bbf0f49c559f185084e42c8ec 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -63,24 +63,24 @@ static void PrintString(const std::string &str) { void RegisterBasicKernels(host_context::KernelRegistry *registry) { RegisterIntBasicKernels(registry); RegisterFloatBasicKernels(registry); - registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); - registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); + registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString)); } void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); - registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); - registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); - registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); - registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); + registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print)); } void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); - registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); - registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); - registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); - registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); + registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print)); } } // namespace kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc index 6cc94dbcce0775cb6b74f993bfdd262fd6a47e6f..8b18aca0210860f4ae688f2133ffa022fda3195d 100644 --- a/paddle/infrt/kernel/control_flow_kernels.cc +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -37,7 +37,7 @@ static void INFRTCall( } void 
RegisterControlFlowKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); + registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall)); } } // namespace kernel diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7055c0c06d5905fa738d8df72c7110fdd82a30d2 --- /dev/null +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -0,0 +1,35 @@ +if (NOT INFRT_WITH_PHI) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + dense_tensor_kernels.cc + context_kernels.cc + allocator_kernels.cc +) + +set(infrt_register_phi_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc) +set(infrt_register_phi_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh) +set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) +set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) + +add_custom_command( + OUTPUT ${infrt_register_phi_kernels_gen_source_file} + COMMAND sh ${infrt_register_phi_kernels_gen_file} + DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} + VERBATIM) +add_custom_target(infrt_register_phi_kernel + COMMAND sh ${infrt_register_phi_kernels_gen_file} + DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} + COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" + VERBATIM) + +cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc + infershaped/infershaped_kernel_launchers.cc + DEPS phi wrapped_infermeta) + +cc_test_tiny(test_infrt_infershape_launchers SRCS +infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/infrt/kernel/pten/allocator_kernels.cc b/paddle/infrt/kernel/phi/allocator_kernels.cc similarity index 81% rename from paddle/infrt/kernel/pten/allocator_kernels.cc rename to paddle/infrt/kernel/phi/allocator_kernels.cc index d3ecbed15da9691514b3688006d547ae54c42db0..eba12e688b4ae2cf9bdd4fa46bb479be882b02fc 100644 --- a/paddle/infrt/kernel/pten/allocator_kernels.cc +++ b/paddle/infrt/kernel/phi/allocator_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
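+// CreateCpuAllocator() below just value-initializes a backends::CpuPhiAllocator;
+// it is exposed to MLIR as the "phi_dt.create_allocator.cpu" kernel in
+// paddle/infrt/kernel/phi/registry.cc.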
-#include "paddle/infrt/kernel/pten/allocator_kernels.h" +#include "paddle/infrt/kernel/phi/allocator_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenAllocator CreateCpuAllocator() { return {}; } +backends::CpuPhiAllocator CreateCpuAllocator() { return {}; } -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/allocator_kernels.h b/paddle/infrt/kernel/phi/allocator_kernels.h similarity index 84% rename from paddle/infrt/kernel/pten/allocator_kernels.h rename to paddle/infrt/kernel/phi/allocator_kernels.h index ddc316c269923e3fc302523e86f64d6233d0c0cf..d10382f5e6014c2b04dab65c8439d99e4563aaef 100644 --- a/paddle/infrt/kernel/pten/allocator_kernels.h +++ b/paddle/infrt/kernel/phi/allocator_kernels.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenAllocator CreateCpuAllocator(); +backends::CpuPhiAllocator CreateCpuAllocator(); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc similarity index 82% rename from paddle/infrt/kernel/pten/context_kernels.cc rename to paddle/infrt/kernel/phi/context_kernels.cc index 0c5e53212113be02e3d57471be80bc1564f8f51f..5284f499916c309c03cbada25ab0de44d5549eec 100644 --- a/paddle/infrt/kernel/pten/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/kernel/pten/context_kernels.h" +#include "paddle/infrt/kernel/phi/context_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext() { return {}; } -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h similarity index 84% rename from paddle/infrt/kernel/pten/context_kernels.h rename to paddle/infrt/kernel/phi/context_kernels.h index 95a20f912efbf1662cf0c1f474bf5f9295ba5861..8082dc6c2ff2950bdcbc8a99e602b7caab2b6ad7 100644 --- a/paddle/infrt/kernel/pten/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/infrt/backends/host/pten_context.h" +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc similarity index 90% rename from paddle/infrt/kernel/pten/dense_tensor_kernels.cc rename to paddle/infrt/kernel/phi/dense_tensor_kernels.cc index b21e418789663e506cf08307528e693ebfb72e7b..ce9200b9918c0a2cfe2ff80312562375bc3dc23f 100644 --- a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( - backends::CpuPtenAllocator* allocator, + backends::CpuPhiAllocator* allocator, host_context::Attribute> dims, host_context::Attribute> lod) { return ::phi::DenseTensor(allocator, @@ -32,6 +32,6 @@ namespace pten { void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values) {} -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h similarity index 89% rename from paddle/infrt/kernel/pten/dense_tensor_kernels.h rename to paddle/infrt/kernel/phi/dense_tensor_kernels.h index 41f701b01032acb415852ac03b147cda47bd015a..25daf7027e8cb1371ae40cec7e45b6ef285ef9e5 100644 --- a/paddle/infrt/kernel/pten/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -14,22 +14,22 @@ #pragma once -#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( - backends::CpuPtenAllocator* allocator, + backends::CpuPhiAllocator* allocator, host_context::Attribute> dims, host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc similarity index 91% rename from paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc rename to paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index c781ca908fdf0d4dec9281f72bdee154611b0c26..2161e98fac8337a766cfcf7eaa27b4486c48dfcb 100644 --- a/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -14,9 +14,9 @@ #include -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("add.cpu.any.fp32"); + auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc similarity index 88% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc rename 
to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index c21339bed38727fd5f7eeb124de7959489893bb6..165f7f7c94377f8b9c1f9c240ee1418cab922cdc 100644 --- a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { @@ -26,9 +26,6 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape( if (value->is_type<::phi::DenseTensor>()) { values.emplace_back(::phi::MetaTensor{&value->get<::phi::DenseTensor>()}); infershape_kernel_frame_builder.AddArgument(values.back().get()); - } else if (value->is_type()) { - values.emplace_back(phi::MetaTensor{&value->get()}); - infershape_kernel_frame_builder.AddArgument(values.back().get()); } else { infershape_kernel_frame_builder.AddArgument(value); } diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_utils.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_utils.h diff --git a/paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h similarity index 63% rename from paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h rename to paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index 9a3e978e966b0702ef29623da6578a3858f8cc64..a0a5b391ea669b1358b14098e32750d709e52fe2 100644 --- a/paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -14,14 +14,36 @@ #pragma once #include +#include +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" namespace infrt { namespace kernel { +static void FakePhiInferShape(const ::phi::MetaTensor& a, + const ::phi::MetaTensor& b, + bool arg_0, + bool arg_1, + ::phi::MetaTensor* c) { + LOG(INFO) << "the ptr of c: " << c; + LOG(INFO) << "c->numel(): " << c->numel(); +} + +static void FakePhiKernel(const ::phi::CPUContext& /*Context*/, + const ::phi::DenseTensor& a, + const ::phi::DenseTensor& b, + bool arg_0, + bool arg_1, + ::phi::DenseTensor* c) { + std::cout << "@FakePhiKernel@" << std::endl; + 
LOG(INFO) << "the ptr of c: " << c; + LOG(INFO) << "c->numel(): " << c->numel(); +} + template ::count}; static const bool turn_on_infer_shape_cache{true}; void Invoke(host_context::KernelFrame* frame) override { +#ifndef NDEBUG + LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes(); +#endif // Build the infershape KernelFrame if needed. // TODO(Superjomn) add unlikely here. if (infershape_kernel_frame_builder.IsEmpty()) { CreateKernelFrameForInferShape(frame); +#ifndef NDEBUG + LOG(INFO) << "infershape.frame: " + << infershape_kernel_frame_builder.DumpArgTypes(); +#endif } if (turn_on_infer_shape_cache) { if (!turn_on_infer_shape_cache || IsShapeChanged(num_input_tensors)) { diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d79814d4bec7fd5a80913f3f3c470e956526c1f --- /dev/null +++ b/paddle/infrt/kernel/phi/registry.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/phi/registry.h" + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/phi/allocator_kernels.h" +#include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h" +#include "paddle/phi/include/infermeta.h" +#include "paddle/phi/include/kernels.h" +#include "paddle/phi/kernels/matmul_kernel.h" + +using infrt::host_context::Attribute; + +namespace infrt { +namespace kernel { + +void RegisterPhiKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("phi_dt.create_allocator.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateCpuAllocator)); + registry->AddKernel("phi_dt.create_context.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateCpuContext)); + registry->AddKernel( + "phi_dt.create_dense_tensor.cpu.f32.nchw", + INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); + registry->AddKernel("phi_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernel( + "phi_dt.fake_phi_kernel", + std::bind(&KernelLauncherFunc, + KernelLauncher(), + std::placeholders::_1)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/pten/registry.h b/paddle/infrt/kernel/phi/registry.h similarity index 88% rename from paddle/infrt/kernel/pten/registry.h rename to paddle/infrt/kernel/phi/registry.h index c290f8ea524fb5d5305445ada409bd03844820c5..c72085a50c1e721543c85d1fa40065502dda0091 100644 --- a/paddle/infrt/kernel/pten/registry.h +++ b/paddle/infrt/kernel/phi/registry.h @@ -27,9 +27,9 @@ namespace infrt { namespace kernel { /** - * Register all the pten kernels to registry. + * Register all the phi kernels to registry. 
*/ -void RegisterPtenKernels(host_context::KernelRegistry* registry); +void RegisterPhiKernels(host_context::KernelRegistry* registry); } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/CMakeLists.txt b/paddle/infrt/kernel/pten/CMakeLists.txt deleted file mode 100644 index fbb205e2af011e32057349dff3be08409cef68b9..0000000000000000000000000000000000000000 --- a/paddle/infrt/kernel/pten/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -if (NOT INFRT_WITH_PTEN) - return() -endif() - -core_gather_headers() - -gather_srcs(infrt_src SRCS - registry.cc - dense_tensor_kernels.cc - context_kernels.cc - allocator_kernels.cc -) - -set(infrt_register_pten_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc) -set(infrt_register_pten_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_pten_kernel_function.sh) -set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) -set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) - -add_custom_command( - OUTPUT ${infrt_register_pten_kernels_gen_source_file} - COMMAND sh ${infrt_register_pten_kernels_gen_file} - DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} - COMMENT "infrt generate ${infrt_register_pten_kernels_gen_source_file}" - VERBATIM) - -cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc - infershaped/infershaped_kernel_launchers.cc - DEPS pten wrapped_infermeta) - -cc_test_tiny(test_infrt_infershape_launchers SRCS -infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/infrt/kernel/pten/registry.cc b/paddle/infrt/kernel/pten/registry.cc deleted file mode 100644 index d70f5deca6aeafa439ce5b19bee78edc46cae368..0000000000000000000000000000000000000000 --- a/paddle/infrt/kernel/pten/registry.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/infrt/kernel/pten/registry.h" - -#include -#include - -#include "paddle/infrt/host_context/kernel_registry.h" -#include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/kernel/pten/allocator_kernels.h" -#include "paddle/infrt/kernel/pten/context_kernels.h" -#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" -#include "paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h" -#include "paddle/phi/include/infermeta.h" -#include "paddle/phi/include/kernels.h" -#include "paddle/phi/kernels/matmul_kernel.h" - -using infrt::host_context::Attribute; - -namespace infrt { -namespace kernel { - -void RegisterPtenKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("pten_dt.create_allocator.cpu", - INFRT_KERNEL(infrt::kernel::pten::CreateCpuAllocator)); - registry->AddKernel("pten_dt.create_context.cpu", - INFRT_KERNEL(infrt::kernel::pten::CreateCpuContext)); - registry->AddKernel( - "pten_dt.create_dense_tensor.cpu.f32.nchw", - INFRT_KERNEL(infrt::kernel::pten::CreateDenseTensorCpuF32Nchw)); - registry->AddKernel("pten_dt.fill_dense_tensor.f32", - INFRT_KERNEL(infrt::kernel::pten::FillDenseTensorF32)); - registry->AddKernel( - "pten.matmul.host.fp32", - std::bind(&kernel::KernelLauncherFunc< - decltype(&::phi::MatmulKernel), - &::phi::MatmulKernel, - decltype(&::phi::MatmulInferMeta), - &::phi::MatmulInferMeta>, - kernel::KernelLauncher< - decltype(&::phi::MatmulKernel), - &::phi::MatmulKernel, - decltype(&::phi::MatmulInferMeta), - &::phi::MatmulInferMeta>(), - std::placeholders::_1)); -} - -} // namespace kernel -} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 1e55bcd07ae8009cd5ca26ccf565ac3036ad8d19..9de1350e97d1af31dc18a116ed7cb38bf0d2f4ef 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -45,7 +45,7 @@ void PrintTensor(const DenseHostTensor &tensor) { } template -void FillTensorWithConstant(DenseHostTensor *tensor, Attribute v) { +void FillTensorWithConstant(Attribute v, DenseHostTensor *tensor) { MutableDTArrayView(tensor).Fill(v.get()); } @@ -53,13 +53,11 @@ TensorMap LoadParams(const std::string &path) { return *(infrt::tensor::LoadParams(path)); } -void TensorMapGetTensor(TensorMap map, - DenseHostTensor *out, - Attribute name) { +DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { auto it = map.find(name.get()); CHECK(it != map.end()) << "No tensor called " << name.get() << " in the TensorMap"; - *out = *it->second; + return *it->second; } int32_t TensorMapGetSize(TensorMap map) { return map.size(); } diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index ccfb3356a855f418f14e42ed8a368f31d2fe8b27..d15bbe221f91a87b047863121f32699175183c54 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -193,8 +193,8 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { } void RegisterTestKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); - registry->AddKernel("infrt.test.shadow_copy_tensor", + registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("Infrt.test.shadow_copy_tensor", INFRT_KERNEL(ShadowCopyTensor)); } diff --git a/paddle/infrt/pass/CMakeLists.txt b/paddle/infrt/pass/CMakeLists.txt new file mode 100755 index 0000000000000000000000000000000000000000..51fecdf907798eb7280a17b294a263fe40993fe2 --- 
/dev/null +++ b/paddle/infrt/pass/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(phi) diff --git a/paddle/infrt/support/variant.h b/paddle/infrt/support/variant.h index 2f415b21c80109f92193db155130a43f3f95557a..b8dcd21ae27fef48811a8e12fda995f687dd828c 100644 --- a/paddle/infrt/support/variant.h +++ b/paddle/infrt/support/variant.h @@ -136,12 +136,12 @@ class Variant { return nullptr; } - IndexT index() { return index_; } + IndexT index() const { return index_; } - private: template static constexpr size_t IndexOf = TupleIndexOf::value; + private: static constexpr size_t kStorageSize = std::max({sizeof(Ts)...}); static constexpr size_t kAlignment = std::max({alignof(Ts)...}); diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir index 3c76b438a0ebaf253d4971c71dc82749a05c3083..2d4d6f2629ec7df989499f0a2e9649c01ae8428a 100644 --- a/paddle/infrt/tests/dialect/basic.mlir +++ b/paddle/infrt/tests/dialect/basic.mlir @@ -1,41 +1,33 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: @basic_f32 func @basic_f32() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK-NEXT: 3 - "infrt.print.f32"(%value) : (f32) -> () + "Infrt.print.f32"(%value) : (f32) -> () - infrt.return %value : f32 + Infrt.return %value : f32 } /// ================================================================ /// @caller call the other function @callee func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { - %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 - %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 - infrt.return %z1 : f32 + %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + Infrt.return %z1 : f32 } // CHECK-LABEL: @caller.add.f32 func @caller.add.f32() -> f32 { - %x = infrt.constant.f32 1.0 - %y = infrt.constant.f32 2.0 - %y1 = infrt.constant.f32 3.0 - %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + %x = Infrt.constant.f32 1.0 + %y = Infrt.constant.f32 2.0 + %y1 = Infrt.constant.f32 3.0 + %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 // CHECK-NEXT: 6 - "infrt.print.f32"(%z) : (f32) -> () - infrt.return %z : f32 + "Infrt.print.f32"(%z) : (f32) -> () + Infrt.return %z : f32 } /// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -// CHECK-LABEL: @string_test -func @string_test() { - %path = infrt.get_string("this is get_string op.") - // CHECK-LABEL: string = this is get_string op. 
- infrt.print_string(%path) - infrt.return -} diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir index 1a57b43499062410b346b38412a533d3edd6fbcc..381fd534f6a5a09e3091203de88ebf00101074af 100644 --- a/paddle/infrt/tests/dialect/benchmark.mlir +++ b/paddle/infrt/tests/dialect/benchmark.mlir @@ -12,13 +12,13 @@ func @benchmark() { // CHECK-LABEL: BM:add.f32:CPU 95%(ns) // CHECK-LABEL: BM:add.f32:CPU 99%(ns) // CHECK-LABEL: BM:add.f32:CPU utilization(percent) - infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 { - %0 = infrt.constant.f32 1.0 - %1 = infrt.constant.f32 2.0 - %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 - "infrt.print.f32"(%res) : (f32) -> () - infrt.return %res : f32 + %0 = Infrt.constant.f32 1.0 + %1 = Infrt.constant.f32 2.0 + %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "Infrt.print.f32"(%res) : (f32) -> () + Infrt.return %res : f32 } - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir index f1def17aa87961d70322ec20b4a86a018250e58d..faade62d35063b1d85c4c1d3ddad98b085a7726c 100644 --- a/paddle/infrt/tests/dialect/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/dense_tensor.mlir @@ -2,23 +2,23 @@ // CHECK-LABEL: dense_shape0 func @dense_shape0() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - infrt.return + Infrt.return } -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { - %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor - %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor + Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } func @main() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) - infrt.return + %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + Infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir index 111c01c9a108bacb0a72ed5e6ff2044487552642..8e2d3bc49b96c645fc72e33af6300307d855e5a4 100644 --- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir +++ b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir @@ -1,31 +1,31 @@ // CHECK-LABEL: @predict -func @predict(%input:!infrt.tensor, %map: !infrt.tensor_map) -> (!infrt.tensor) { - %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.tensor - %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.tensor +func @predict(%input:!Infrt.tensor, %map: !Infrt.tensor_map) -> (!Infrt.tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> 
!Infrt.tensor - %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor // fc - "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () - //dt.print_tensor (%out : !infrt.tensor) + "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + //dt.print_tensor (%out : !Infrt.tensor) - infrt.return %out : !infrt.tensor + Infrt.return %out : !Infrt.tensor } // CHECK-LABEL: @main func @main() { - %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %path = infrt.get_string("/infrt/build/paddle/paddle_1.8_fc_model") + %path = Infrt.get_string("/Infrt/build/paddle/paddle_1.8_fc_model") // CHECK-LABEL: loading params %map = dt.load_params(%path) - %out = infrt.call @predict(%input, %map): (!infrt.tensor, !infrt.tensor_map) -> (!infrt.tensor) - dt.print_tensor (%out : !infrt.tensor) + %out = Infrt.call @predict(%input, %map): (!Infrt.tensor, !Infrt.tensor_map) -> (!Infrt.tensor) + dt.print_tensor (%out : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir index d98f107bab41e959d82acfd681d762d7981eab51..b59cfb04816974cbdb923e6d18af1184be963c59 100644 --- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir +++ b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir @@ -7,15 +7,15 @@ func @main() -> tensor { %bias1 = "pd.feed"() {name="input4"} : () -> tensor %bias2 = "pd.feed"() {name="input5"} : () -> tensor - %d = "pd.elementwise_add"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor %e = "pd.relu6"(%d) {} : (tensor) -> tensor %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor "pd.fetch"(%e2) {name="output"} :(tensor)->() diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index 02511b21e4792bb37c416093a7c272090eae44c1..48ee4b9d725c0aa36d4849c2842c99997de5c8ee 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -3,8 +3,7 @@ func @ops() { %a = pd.feed() {name="input0"} : tensor %b = pd.feed() {name="input1"}: tensor - %d = pd.feed() {name="input3"}: !Infrt.lod_tensor<3x4x9xf32, 0> + %d = pd.feed() {name="input3"}: 
!infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir index 88f5b289fd9f843803fddf0cd98859839ef271de..f0b0b849b93cb1d42ce172c2cff90a41741c1d3d 100644 --- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir @@ -1,11 +1,13 @@ -// RUN: infrtopt %s | FileCheck %s +// RUN: infrtexec -i %s | FileCheck %s -// CHECK-LABEL: @basic_tensor -func @basic_tensor() { - %a = "pten_dt.create_allocator.cpu" (): () -> !pten.CPU_allocator - %b = "pten_dt.create_context.cpu" (): () -> !pten.CPU_context - %c = "pten_dt.create_dense_tensor.cpu.f32.nchw" (%a) {dims=[1:i64], lod=[1:i64]}: (!pten.CPU_allocator) -> (!infrt.tensor) - // "pten_dt.fill_dense_tensor.f32" (%c) {value=[1.0:f32]} : (!infrt.tensor) -> () +// CHECK-LABEL: @fake_phi_kernel_execute +func @fake_phi_kernel_execute() { + %allocator = "phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator + %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) - infrt.return + // CHECK: @FakePhiKernel@ + %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + Infrt.return } + diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/pten/pten_pass.mlir new file mode 100644 index 0000000000000000000000000000000000000000..30ff2636ae5a41674883e63ff931629a0d140b84 --- /dev/null +++ b/paddle/infrt/tests/dialect/pten/pten_pass.mlir @@ -0,0 +1,10 @@ +// RUN: infrtopt %s | FileCheck %s +// CHECK-LABEL: @ops +func @ops() { + %a = pd.feed() {name="input0"} : !infrt.lod_tensor + %b = pd.feed() {name="input1"} : !infrt.lod_tensor + %d = pd.feed() {name="input3"} : !infrt.lod_tensor<3x4x9xf32, 0> + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir index ff7f36f5078d62d7e8713bba226f7271a7a2664b..76ae140dd6cbd741f992315ee35d3e94058d4674 100644 --- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir @@ -1,23 +1,23 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: dense_shape0 func @dense_shape0() { - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - infrt.return + Infrt.return } -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { - %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor - %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor + Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } func @main() { %shape = 
ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) - infrt.return + %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir index 914e863db49cca3320c74b11b624e3d7dfe3b6f8..52b296e06cd365fbaa1249108f877dc9f7480ff0 100644 --- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir +++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir @@ -2,34 +2,34 @@ // CHECK-LABEL: naive_elementwise_add func @naive_elementwise_add() { // create a - %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // create b - %b = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%b : !infrt.tensor) {value=2.0:f32} + %b = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%b : !infrt.dense_tensor) {value=2.0:f32} // get c - %c = dt.naive_elementwise_add.f32(%a, %b) {} : (!infrt.tensor, !infrt.tensor) -> !infrt.tensor + %c = dt.naive_elementwise_add.f32(%a, %b) {} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] - dt.print_tensor (%c : !infrt.tensor) + dt.print_tensor (%c : !infrt.dense_tensor) - infrt.return + Infrt.return } // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: naive_matmul func @naive_matmul() { // create a - %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // create b - %b = dt.create_uninit_tensor.f32 [8:i64, 4:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%b : !infrt.tensor) {value=2.0:f32} + %b = dt.create_uninit_tensor.f32 [8:i64, 4:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%b : !infrt.dense_tensor) {value=2.0:f32} // get c - %c = dt.naive_matmul.f32(%a, %b) {} : (!infrt.tensor, !infrt.tensor) -> !infrt.tensor + %c = dt.naive_matmul.f32(%a, %b) {} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16] - dt.print_tensor (%c : !infrt.tensor) + dt.print_tensor (%c : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 4edb918b5a28fdfed2b68b647167f41c90d27d9a..5c1396d47f551618bcdf95ef55c875aa2cb0d684 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -1,15 +1,15 @@ // RUN: infrtexec -i %s | FileCheck %s func @load_tensor_map() { - %path = infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model") + %path = Infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model") %map = 
dt.load_params(%path) %size = dt.tensor_map_get_size(%map) -> i32 - infrt.print.i32 %size + Infrt.print.i32 %size - %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.tensor + %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2], values=[0, 0] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir index 09210078b9d7d139f2bc2534acf07e83aa1146bb..5623aef71aa2c33ff0bd3524855c56e9dcab5e9b 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir index 01a2f7df32608ad64d2929b4b24f96cf4e5062c4..e580634055a72eae66196f67c8321c308599a1af 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir @@ -1,10 +1,10 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: test_tensor_type func @test_tensor_type() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor_shape.mlir index 09210078b9d7d139f2bc2534acf07e83aa1146bb..5623aef71aa2c33ff0bd3524855c56e9dcab5e9b 100644 --- a/paddle/infrt/tests/dialect/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir index 01a2f7df32608ad64d2929b4b24f96cf4e5062c4..e580634055a72eae66196f67c8321c308599a1af 100644 --- a/paddle/infrt/tests/dialect/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor_type.mlir @@ -1,10 +1,10 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: test_tensor_type func @test_tensor_type() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index cc95e0bf8fdcc5f97872ef84917ea5910b00980c..7b074d0ebb76d110dc361140bd42f78ef54f224b 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -1,5 +1,5 @@ -# pten auto cmake utils -include(pten) +# phi auto cmake utils +include(phi) # paddle experimental common components 
add_subdirectory(common) @@ -23,16 +23,16 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) -get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) +get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) # keep this message for debug, remove it later if needless -message(STATUS "All standard pten kernels: ${pten_kernels}") -set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) +message(STATUS "All standard phi kernels: ${phi_kernels}") +set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) -cc_library(pten DEPS ${PTEN_DEPS}) +cc_library(phi DEPS ${PHI_DEPS}) -set(pten_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") -file(WRITE ${pten_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") +set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") +file(WRITE ${phi_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") # generate inner headers include dir for users generate_unify_header(backends) diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index a993cb3ff8041dcaa9734687c0409aaa3e6cebc8..d632db046d15ca73837292a5cb1e44479ab2c6ed 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(lib) -cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api manual_api sparse_api) +cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api) diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 8d840214092ba9b1d7e6cc351cee1abfc816e7f8..154b84670aaf992833fccf9297d8b16a081e173f 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -24,13 +24,12 @@ limitations under the License. */ #endif #endif -// new pten apis +// new phi apis #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" -// pten common headers +// phi common headers #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" @@ -41,7 +40,6 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_kernel_info.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/ext/dispatch.h b/paddle/phi/api/ext/dispatch.h index 4e5fa879a2cfc759cea753be8db19e116d91669e..6b6d0ae7fe7230263454d0bf08da40e4a793549b 100644 --- a/paddle/phi/api/ext/dispatch.h +++ b/paddle/phi/api/ext/dispatch.h @@ -292,7 +292,7 @@ namespace paddle { paddle::experimental::complex128, \ __VA_ARGS__) \ default: \ - PADDLE_THROW(paddle::platform::errors::InvalidArgument( \ + PADDLE_THROW(phi::errors::InvalidArgument( \ "Invalid enum data type `%d`.", static_cast(__dtype__))); \ } \ }() diff --git a/paddle/phi/api/ext/op_kernel_info.h b/paddle/phi/api/ext/op_kernel_info.h deleted file mode 100644 index b52b0abe9e745d7a559a4f4752bb9a77e4137245..0000000000000000000000000000000000000000 --- a/paddle/phi/api/ext/op_kernel_info.h +++ /dev/null @@ -1,1256 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/api/ext/dll_decl.h" -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/utils/any.h" -#include "paddle/utils/small_vector.h" - -#include "paddle/phi/common/data_type.h" - -/** - * Custom Kernel Info Define. - * - * Used to maintain custom kernel core information before registering. - * Pten is working on exposing headers, custom kernel depends on them, and - * we prefer outer users following pten-kernel-function-style and registering - * macro. So, we have to re-implement some structs or class and functions to - * make sure users' custom kernel functions can be registered to pten. - * - * TODO(Aganlengzi): We should upgrade following pten. - */ - -namespace paddle { -namespace framework { -class PADDLE_API OpKernelInfoHelper; -} // namespace framework - -// TODO(Aganlengzi): Simple DeviceContext temporarily for stream getting -// before phi::DeviceContext is exposed. -class DeviceContext { - public: - DeviceContext() { stream_ = nullptr; } - void set_stream(void* stream) { stream_ = stream; } - void* stream() const { return stream_; } - - private: - void* stream_; -}; -class CPUContext : public DeviceContext {}; - -// TODO(Aganlengzi): Use paddle::Tensor before DenseTensor is exposed -using Tensor = paddle::experimental::Tensor; -using Scalar = phi::Scalar; -using ScalarArray = phi::ScalarArray; - -// Record custom kernel core information -// We can not use phi::KernelFn directly, so users' custom kernel function -// is signatured to `CustomKernelFunc', notice that the first parameter is -// fixed to `const DeviceContext&'. 
-using CustomKernelFunc = - void (*)(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs); - -////////////////////// Kernel Function (PD_PT_KERNEL) //////////////////////// -#define PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(device_ctx) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert(vec_in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert( \ - attr_idx == 0, \ - "Kernel's DeviceContext should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - const device_ctx& arg = static_cast(dev_ctx); \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const Tensor& arg = inputs[in_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper&, Tail...> { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const std::vector& arg = vec_inputs[vec_in_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... 
pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - try { \ - attr_type arg = paddle::any_cast(attrs[attr_idx]); \ - return CustomComputeCallHelper::template Compute< \ - dev_ctx_idx, \ - in_idx, \ - vec_in_idx, \ - attr_idx + 1, \ - out_idx, \ - vec_out_idx>(dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } catch (paddle::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - tensor_type* arg = (*outputs)[out_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper, Tail...> { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - std::vector arg = (*vec_outputs)[vec_out_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -template -struct PtenTypeTag {}; - -template -struct CustomKernelFuncImpl; - -template -struct CustomKernelFuncImpl { - static void Compute(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs) { - CustomComputeCallHelper>:: - template Compute<0, 0, 0, 0, 0, 0>( - dev_ctx, inputs, vec_inputs, attrs, outputs, vec_outputs); - } - - // NOTE: Tensor in args is paddle::Tensor but not DenseTensor - static void VariadicCompute(const DeviceContext& dev_ctx, Args... 
args) { - return impl_fn(static_cast(dev_ctx), std::forward(args)...); - } - - private: - template - struct CustomComputeCallHelper; - - /* DeviceContext Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(CPUContext); - - /* Input Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(Tensor); - PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(Tensor); - - /* Attribute Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - - /* Output Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(Tensor); - PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(Tensor); - - // End: base template - template - struct CustomComputeCallHelper> { - template - static void Compute(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs, - DevCtx device_ctx, - Args... args) { - return impl_fn(device_ctx, args...); - } - }; -}; - -#define PD_PT_KERNEL(...) \ - ::paddle::CustomKernelFuncImpl::Compute - -#define PD_PT_VARIADIC_KERNEL(...) \ - reinterpret_cast( \ - &::paddle::CustomKernelFuncImpl::VariadicCompute) - -////////////////////// Op Kernel Info depended structs ////////////////////// -// TODO(Aganlengzi): Re-define TensorArgDef and AttributeArgDef temporarily. -// TensorArgDef follows phi::TensorArgDef in kernel_factory.h, the -// difference is that custom_kernel needs extra `is_vector' to ensure we can -// deal with case like vector with only one element. -struct TensorArgDef { - phi::Backend backend; - phi::DataLayout layout; - phi::DataType dtype; - bool is_vector{false}; - - TensorArgDef(phi::Backend in_backend, - phi::DataLayout in_layout, - phi::DataType in_dtype, - bool is_vector = false) - : backend(in_backend), - layout(in_layout), - dtype(in_dtype), - is_vector(is_vector) {} - - TensorArgDef& SetBackend(phi::Backend in_backend) { - backend = in_backend; - return *this; - } - - TensorArgDef& SetDataLayout(phi::DataLayout in_layout) { - layout = in_layout; - return *this; - } - - TensorArgDef& SetDataType(phi::DataType in_dtype) { - dtype = in_dtype; - return *this; - } -}; - -// AttributeArgDef follows phi::AttributeArgDef in kernel_factory.h -struct AttributeArgDef { - std::type_index type_index; - - explicit AttributeArgDef(std::type_index type_index) - : type_index(type_index) {} -}; - -////////////////////// Op Kernel Info ////////////////////// -// OpKernelInfo stores all info parsed from user kernel function, includes: -// 0. op_name and kernel key(backend, data_layout and data_type) -// 1. unified custom kernel function -// 2. variadic kernel function(use paddle::Tensor) -// 3. 
args info and user defined change for specific arg -class PADDLE_API OpKernelInfo { - public: - explicit OpKernelInfo(const std::string& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type) - : op_name_(op_name), - backend_(backend), - layout_(data_layout), - dtype_(data_type) {} - - // format: PD_PT_KERNEL(...) - OpKernelInfo& SetKernelFn(CustomKernelFunc&& func); - // format: PD_PT_VARIADIC_KERNEL(...) - OpKernelInfo& SetVariadicKernelFn(void* func); - - // for Args parsing and storing - void AppendInput(phi::Backend backend, - phi::DataLayout layout, - phi::DataType dtype, - bool is_vector = false) { - input_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); - } - - void AppendOutput(phi::Backend backend, - phi::DataLayout layout, - phi::DataType dtype, - bool is_vector = false) { - output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); - } - - void AppendAttribute(std::type_index type_index) { - attribute_defs_.emplace_back(AttributeArgDef(type_index)); - } - - // for Args user-def function - TensorArgDef& InputAt(size_t idx) { return input_defs_.at(idx); } - TensorArgDef& OutputAt(size_t idx) { return output_defs_.at(idx); } - - const phi::Backend& GetBackend() const { return backend_; } - const phi::DataLayout& GetDataLayout() const { return layout_; } - const phi::DataType& GetDataType() const { return dtype_; } - - private: - friend class framework::OpKernelInfoHelper; - - // 1. op info - std::string op_name_; - - // 2. kernel key info - phi::Backend backend_{phi::Backend::UNDEFINED}; - phi::DataLayout layout_{phi::DataLayout::UNDEFINED}; - phi::DataType dtype_{phi::DataType::UNDEFINED}; - - // 3. args info - paddle::SmallVector input_defs_{{}}; - paddle::SmallVector output_defs_{{}}; - paddle::SmallVector attribute_defs_{{}}; - - // 4. 
func info - CustomKernelFunc kernel_fn_{nullptr}; - void* variadic_kernel_fn_{nullptr}; -}; - -////////////////////// Op Kernel Args Parser ////////////////////// -// Define CustomKernelArgsParseFunctor for args parsing -// We have to store parsed info into OpKernelInfo before -// mapping to phi::KernelArgsDef in phi::Kernel -template -struct CustomKernelArgsParseFunctor; - -template -struct CustomKernelArgsParseFunctor { - using Args = std::tuple; - enum : std::size_t { Arity = sizeof...(Args_) }; - using Indices = std::make_index_sequence; - template - using Arg = typename std::tuple_element::type; - - static void Parse(OpKernelInfo* op_kernel_info) { - const phi::Backend& backend = op_kernel_info->GetBackend(); - const phi::DataLayout& layout = op_kernel_info->GetDataLayout(); - const phi::DataType& dtype = op_kernel_info->GetDataType(); - - auto default_tensor_layout = phi::DataLayout::NCHW; - if (layout != phi::DataLayout::ANY) { - default_tensor_layout = layout; - } - auto args_type = ParseArgType(Indices{}); - for (auto arg_type : args_type) { - if (arg_type == std::type_index(typeid(const CPUContext&))) { - // do nothing, skip context arg now - } else if (arg_type == std::type_index(typeid(const Tensor&))) { - op_kernel_info->AppendInput(backend, default_tensor_layout, dtype); - } else if (arg_type == - std::type_index(typeid(const std::vector&))) { - op_kernel_info->AppendInput( - backend, default_tensor_layout, dtype, true); - } else if (arg_type == std::type_index(typeid(Tensor*))) { - op_kernel_info->AppendOutput(backend, default_tensor_layout, dtype); - } else if (arg_type == std::type_index(typeid(std::vector))) { - op_kernel_info->AppendOutput( - backend, default_tensor_layout, dtype, true); - } else { - op_kernel_info->AppendAttribute(arg_type); - } - } - } - - private: - template - static std::vector ParseArgType( - std::index_sequence) { - return {std::type_index(typeid(Arg))...}; - } -}; - -#define PD_PT_ARGS_PARSE(...) \ - ::paddle::CustomKernelArgsParseFunctor::Parse - -//////////////// Op Kernel Info Map ///////////////// -// all user custom kernels information are stored in this map -class PADDLE_API OpKernelInfoMap { - public: - static OpKernelInfoMap& Instance() { - static OpKernelInfoMap g_custom_kernel_info_map; - return g_custom_kernel_info_map; - } - - std::vector& operator[](const std::string& name); - - const std::unordered_map>& GetMap() - const; - - private: - OpKernelInfoMap() = default; - std::unordered_map> map_; - - PD_DISABLE_COPY_AND_ASSIGN(OpKernelInfoMap); -}; - -//////////////// Op Kernel Info Builder ///////////////// -// format: PD_PT_ARGS_PARSE(...) 
-using CustomKernelArgsParseFn = void (*)(OpKernelInfo* op_kernel_info); -using CustomKernelArgsDefFn = void (*)(OpKernelInfo* kernel); - -class PADDLE_API OpKernelInfoBuilder { - public: - explicit OpKernelInfoBuilder(std::string&& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type); - - OpKernelInfoBuilder& SetKernelFn(CustomKernelFunc func); - OpKernelInfoBuilder& SetVariadicKernelFn(void* func); - OpKernelInfoBuilder& ArgsParse(CustomKernelArgsParseFn func); - OpKernelInfoBuilder& ArgsDef(CustomKernelArgsDefFn func); - - private: - // op name - std::string op_name_; - - // kernel key info - phi::Backend backend_{phi::Backend::UNDEFINED}; - phi::DataLayout layout_{phi::DataLayout::UNDEFINED}; - phi::DataType dtype_{phi::DataType::UNDEFINED}; - - // ref current info ptr - OpKernelInfo* info_ptr_; -}; -/////////////////////// Custom kernel register API ///////////////////////// -// For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) -void RegisterAllCustomKernel(); - -//////////////// Custom kernel register macro ///////////////////// -// Refer to paddle/phi/core/kernel_registry.h, we can not use -// PT_REGISTER_KERNEL directly, common macros and functions are -// not ready for custom kernel now. -// Difference: custom_kernel stores all kernels' info into global -// g_custom_kernel_info_map before loading and registering into -// pten kernel management. Only providing PD_REGISTER_KERNEL which -// supports 2 template arguments. - -#define PD_BACKEND(arg__) phi::Backend::arg__ -#define PD_DATALAYOUT(arg__) phi::DataLayout::arg__ -#define PD_DATATYPE(arg__) phi::DataType::arg__ - -#define PD_NARGS(...) _PD_NARGS((__VA_ARGS__, _PD_RESQ_N())) -#define _PD_NARGS(...) _PD_ARG_N(__VA_ARGS__) -#define _PD_ARG_N_EXPAND( \ - _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) \ - N -#define _PD_ARG_N(args) _PD_ARG_N_EXPAND args -#define _PD_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -#define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2) -#define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2) -#define PD_CONCATENATE2(arg1, arg2) arg1##arg2 - -#define PD_EXPAND(x) x - -#ifdef __COUNTER__ -#define PD_ID __COUNTER__ -#else -#define PD_ID __LINE__ -#endif - -#define PD_REGISTER_KERNEL(kernel_name, backend, layout, func, cpp_dtype, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PD_REGISTER_KERNEL must be called in global namespace."); \ - _PD_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, func, cpp_dtype, ##__VA_ARGS__) - -// WIN32 is not supported -#define _PD_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ##__VA_ARGS__); \ - static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::paddle::OpKernelInfo* kernel); \ - PD_KERNEL_REGISTRAR_INIT( \ - kernel_name, \ - backend, \ - layout, \ - &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__); \ - void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::paddle::OpKernelInfo* kernel) - -#define PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - _PD_KERNEL_INSTANTIATION(PD_NARGS(cpp_dtype, ##__VA_ARGS__), \ - meta_kernel_fn, \ - backend, \ - cpp_dtype, \ - ##__VA_ARGS__) - -#define _PD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, cpp_dtype, ...) \ - PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, cpp_dtype, ##__VA_ARGS__) - -#define _PD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, ##__VA_ARGS__)) - -#define PD_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ - _PD_KERNEL_REGISTRAR_INIT(PD_NARGS(cpp_dtype, ##__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__) - -// clang-format off - -/* The =pre-commit always treats this macro into the wrong format, - and multi-line macros cannot be skipped with NOLINT.*/ -#define _PD_KERNEL_REGISTRAR_INIT(N, \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - PD_CONCATENATE(_PD_KERNEL_REGISTRAR_INIT_, N) ( \ - kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__) - -// clang-format on - -#define _PD_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); - -#define _PD_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) -} // namespace paddle diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index db0c28198e80a863030b740d192ef662be43fba6..c268742fa567bffecb2fd17a773ab56aee019853 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -49,8 +49,6 @@ namespace paddle { namespace experimental { -class CompatiblePTenTensorUtils; - class AbstractAutogradMeta { public: // No AbstractAutogradMeta should be created @@ -59,7 +57,7 @@ class AbstractAutogradMeta { /** * Tensor is the API description of the basic data structure in the - * [ "Paddle Tensor Operation (pten)" Library ]. + * [ "Paddle Tensor Operation (phi)" Library ]. * * It is not limited to a simple n-dimensional array. * It contains a smart pointer to `TensorImpl`. The data description contained @@ -366,7 +364,7 @@ class PADDLE_API Tensor final { /* Part 5: Data Transform methods */ /* Alert!!!!: All copy method can only deep copy impl, autograd info only be * copied */ - /* out of pten */ + /* out of phi */ /** * @brief Copy the current Tensor data to the specified device * and return the new Tensor. It's usually used to set the input tensor data. @@ -476,9 +474,6 @@ class PADDLE_API Tensor final { /* Part 9: Auto generated Tensor methods */ - private: - friend class CompatiblePTenTensorUtils; - private: /** * [ Why use abstract TensorImpl interface here? 
] diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 175bf34c0da66fbd4ee8bc8451e5b35334b813ce..5edb83f8c3fc01d198d3f63b64047b9e45cd747b 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) elseif (WITH_ROCM) - hip_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) else() - cc_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) @@ -83,17 +83,16 @@ add_custom_command( DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base} VERBATIM) -cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor_raw pten_context kernel_factory) -cc_library(pten_data_transform SRCS data_transform.cc DEPS pten_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) -cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor_raw pten kernel_dispatch pten_data_transform) +cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) +cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) -cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_api) +cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) +cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) +cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) -cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor_raw) +cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl) +cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) -cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_dygraph_api SRCS ${dygraph_api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_data_transform pten_function_api) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) +cc_library(phi_tensor SRCS tensor_method.cc DEPS 
phi_tensor_raw phi_function_api) diff --git a/paddle/phi/api/lib/manual_api.cc b/paddle/phi/api/lib/api_custom_impl.cc similarity index 55% rename from paddle/phi/api/lib/manual_api.cc rename to paddle/phi/api/lib/api_custom_impl.cc index e0da15eac39b79f3b8ffde3f4c068d02ce28ae6c..c7400b93fcdc18314318fae9482e1e5e5bfb8aef 100644 --- a/paddle/phi/api/lib/manual_api.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/api/include/manual_api.h" - -#include - -#include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" @@ -25,82 +21,57 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); -#endif - -#ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); -#endif +#include "glog/logging.h" namespace paddle { namespace experimental { -PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { - // 1. Get kernel signature and kernel +Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "copy", kernel_key); - VLOG(0) << "to API kernel key: " << kernel_key; - VLOG(0) << "to API kernel: " << kernel; + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; - // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x.get()); - kernel_context.EmplaceBackAttr(blocking); - - // 4. Prepare outputs & InferMeta - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPtenPlace(backend)), - phi::DenseTensorMeta()); - phi::MetaTensor meta_out(dense_out.get()); - phi::UnchangedInferMeta(*dense_x, &meta_out); - dense_out->mutable_data(phi::TransToPtenPlace(backend)); - kernel_context.EmplaceBackOutput(dense_out.get()); + + auto dense_x = TensorToDenseTensor(x); + Tensor out; - out.set_impl(dense_out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); - // 5. 
Call kernel - kernel(&kernel_context); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)( + *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out); return out; } -PADDLE_API std::vector split(const Tensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } +std::vector split_impl(const Tensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + + Backend kernel_backend = kernel_key.backend(); + DataLayout kernel_layout = kernel_key.layout(); + DataType kernel_data_type = kernel_key.dtype(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "split", {kernel_backend, kernel_layout, kernel_data_type}); @@ -144,7 +115,6 @@ PADDLE_API std::vector split(const Tensor& x, return out; } + } // namespace experimental } // namespace paddle - -PT_REGISTER_API(Utils); diff --git a/paddle/phi/api/include/manual_api.h b/paddle/phi/api/lib/api_custom_impl.h similarity index 61% rename from paddle/phi/api/include/manual_api.h rename to paddle/phi/api/lib/api_custom_impl.h index 72d348f33918ce545bc7ecf4517d40756cbb1343..5acb68a3281332565d0b094a37fc8ee38c4904ab 100644 --- a/paddle/phi/api/include/manual_api.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,22 +19,15 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" -/** - * This file stores some special APIs that are implemented manually - * or difficult to automatically generated. 
- */ - namespace paddle { namespace experimental { // TODO(chenweihang): Replace backend by place when place is ready -PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking); +Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking); -// TODO(chentianyu03): Split API has extra logic to calculate the outputs size, -// api_gen do not support -PADDLE_API std::vector split(const Tensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis); +std::vector split_impl(const Tensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis); } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_declare.h b/paddle/phi/api/lib/api_declare.h index 650161a933a8cb9ba02d1385eef3c7bd0dc09a08..a5d3578d681b6f20992b4560c4dbca4fcd7089a7 100644 --- a/paddle/phi/api/lib/api_declare.h +++ b/paddle/phi/api/lib/api_declare.h @@ -17,6 +17,5 @@ limitations under the License. */ // api symbols declare, remove in the future #include "paddle/phi/api/lib/api_registry.h" -PT_DECLARE_API(Math); -PT_DECLARE_API(Utils); -PT_DECLARE_API(SparseApi); +PD_DECLARE_API(Math); +PD_DECLARE_API(SparseApi); diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 2812bede8e09ba99577efd69d928d89e8431cf25..212a2f96452f69496d9ca60fdc3c8cdb643b9679 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -27,7 +27,7 @@ namespace experimental { #endif /** - * Now there is no module to call pten's API. When compiling, the function + * Now there is no module to call phi's API. When compiling, the function * implementation will be optimized. Therefore, the symbol will be exposed * manually for the time being. * @@ -36,12 +36,12 @@ namespace experimental { */ // use to declare symbol -#define PT_REGISTER_API(name) \ +#define PD_REGISTER_API(name) \ PADDLE_API int RegisterSymbolsFor##name() { return 0; } -#define PT_DECLARE_API(name) \ +#define PD_DECLARE_API(name) \ extern PADDLE_API int RegisterSymbolsFor##name(); \ - UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() + UNUSED static int use_phi_api_##name = RegisterSymbolsFor##name() } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_utils.h index 948e40cd28d5847bbcfb0c4c0ec8f1f39246d22f..6c1fa97c0f52a697383a3526220cc758d778823d 100644 --- a/paddle/phi/api/lib/api_utils.h +++ b/paddle/phi/api/lib/api_utils.h @@ -31,6 +31,14 @@ inline std::shared_ptr TensorToDenseTensor( return std::dynamic_pointer_cast(tensor.impl()); } +inline std::shared_ptr TensorToDenseTensor( + const paddle::optional& tensor) { + if (tensor) { + return std::dynamic_pointer_cast(tensor->impl()); + } + return nullptr; +} + inline std::unique_ptr> TensorToDenseTensor( const std::vector& tensors) { auto pt_tensors = std::make_unique>(); @@ -49,12 +57,28 @@ inline std::shared_ptr TensorToSelectedRows( return std::dynamic_pointer_cast(tensor.impl()); } +inline std::shared_ptr TensorToSelectedRows( + const paddle::optional& tensor) { + if (tensor) { + return std::dynamic_pointer_cast(tensor->impl()); + } + return nullptr; +} + /* ----------------- for infer_meta --------------------- */ inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } +inline paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + inline std::vector MakeMetaTensor( const std::vector& 
tensors) { std::vector meta_tensors; @@ -69,12 +93,20 @@ inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } +inline paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + /* ------------------ for output ----------------------- */ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPtenPlace(backend)), + phi::make_intrusive(phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); out->set_impl(dense_tensor); return dense_tensor.get(); @@ -88,7 +120,7 @@ inline std::vector SetKernelOutput( std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPtenPlace(backend)), + phi::make_intrusive(phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); results[i] = tensor_ptr.get(); out->emplace_back(); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 9fd91f398f7f47133bc0b13b632860c531d87995..ae67e2ebb35ccef7fe07ee8c76db33a459b1dfce 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -38,7 +38,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, const TransformFlag& transform_flag) { bool ret = transform_flag.need_trans_backend() && target != Backend::ALL_BACKEND && - !platform::is_same_place(input, phi::TransToPtenPlace(target)); + !platform::is_same_place(input, phi::TransToPhiPlace(target)); return ret; } @@ -168,10 +168,10 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, out.place(), target_args_def.backend, transform_flag)) { phi::DenseTensor result( phi::make_intrusive( - phi::TransToPtenPlace(target_args_def.backend)), + phi::TransToPhiPlace(target_args_def.backend)), {out.dtype(), out.dims(), out.layout()}); framework::TransDataDevice( - out, phi::TransToPtenPlace(target_args_def.backend), &result); + out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; } return out; @@ -199,6 +199,16 @@ std::shared_ptr PrepareData( return std::make_shared(out); } +std::shared_ptr PrepareData( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + if (input) { + return PrepareData(*input, target_args_def, transform_flag); + } + return {nullptr}; +} + std::unique_ptr> PrepareData( const std::vector& inputs, const phi::TensorArgDef& target_args_def, diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 9942b2f90b03becca5706f773339eb80fd3a4be8..8eb1c4a179aed832bdd7b69dd0112ab46107a718 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -66,6 +66,11 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); +std::shared_ptr PrepareData( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + std::unique_ptr> PrepareData( const std::vector& inputs, const phi::TensorArgDef& target_args_def, diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 5251473f3b5c9ab272499436c8a2091725449644..0e3ca1af4967c2bf2ae302ea656a31198d187f01 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ 
b/paddle/phi/api/lib/kernel_dispatch.cc @@ -21,7 +21,7 @@ namespace experimental { namespace detail { BackendSet GetTensorBackendSet(const Tensor& t) { - BackendSet backend_set(phi::TransToPtenBackend(t.inner_place())); + BackendSet backend_set(phi::TransToPhiBackend(t.inner_place())); switch (t.layout()) { case DataLayout::MKLDNN: backend_set = backend_set | BackendSet(Backend::MKLDNN); @@ -53,7 +53,7 @@ std::size_t CountLeadingZeros(uint64_t val) { phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) { auto& pool = paddle::platform::DeviceContextPool::Instance(); - return pool.Get(phi::TransToPtenPlace(backend)); + return pool.Get(phi::TransToPhiPlace(backend)); } DataType ParseDataType(DataType dtype) { return dtype; } @@ -83,7 +83,7 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) { Backend ParseBackend(Backend backend) { return backend; } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPtenBackend(tensor.inner_place()); + return phi::TransToPhiBackend(tensor.inner_place()); } Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index ad315ededf5d77a550375855b70bf3927c316941..9a09bc2183ad73857d5afee8909d957e65c5a664 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -51,7 +51,7 @@ struct KernelKeySet { DataType dtype{DataType::UNDEFINED}; // TODO(chenweihang): iterate all kernelkey for kernel selection - phi::KernelKey GetHigestPriorityKernelKey() { + phi::KernelKey GetHighestPriorityKernelKey() { return phi::KernelKey(static_cast(64 - detail::CountLeadingZeros( backend_set.bitset())), layout, diff --git a/paddle/phi/api/lib/op_kernel_info.cc b/paddle/phi/api/lib/op_kernel_info.cc deleted file mode 100644 index 78b4955f321da0a3b37cc766287806acd37f37ac..0000000000000000000000000000000000000000 --- a/paddle/phi/api/lib/op_kernel_info.cc +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/fluid/framework/custom_kernel.h" - -namespace paddle { - -////////////////////// Op Kernel Info ////////////////////// - -OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) { - kernel_fn_ = std::forward(func); - return *this; -} - -OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) { - variadic_kernel_fn_ = func; - return *this; -} - -//////////////// Op Kernel Info Map ///////////////// - -std::vector& OpKernelInfoMap::operator[]( - const std::string& name) { - return map_[name]; -} - -const std::unordered_map>& -OpKernelInfoMap::GetMap() const { - return map_; -} - -//////////////// Op Kernel Info Builder ///////////////// - -OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type) { - // 1. 
member assign - op_name_ = std::forward(op_name); - backend_ = backend; - layout_ = data_layout; - dtype_ = data_type; - - // 2. info parse - auto& info_vector = OpKernelInfoMap::Instance()[op_name_]; - auto op_kernel_info = OpKernelInfo(op_name_, backend_, layout_, dtype_); - info_vector.emplace_back(std::move(op_kernel_info)); - - // 3. get current info ptr - info_ptr_ = &(info_vector.back()); -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) { - info_ptr_->SetKernelFn(std::forward(func)); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) { - info_ptr_->SetVariadicKernelFn(func); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse( - CustomKernelArgsParseFn func) { - func(this->info_ptr_); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) { - func(this->info_ptr_); - return *this; -} - -/////////////////////// Op register API ///////////////////////// - -// For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) -void RegisterAllCustomKernel() { - auto& op_kernel_info_map = OpKernelInfoMap::Instance(); - framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map); -} - -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpKernelInfoMap. -paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() { - return paddle::OpKernelInfoMap::Instance(); -} - -#ifdef __cplusplus -} // end extern "C" -#endif diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index 5a22d617492d2121de3acdb2e10bcaaa60f78a24..9e1f59c0aa74329b15efcbff123b137fbf0b1360 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -22,20 +22,20 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); #endif namespace paddle { @@ -51,7 +51,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, // 1. 
Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_coo"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_coo"; @@ -86,11 +86,11 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, // create empty SparseCooTensor phi::DenseTensor non_zero_indices( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(indices_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -112,7 +112,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_csr"; if (x.layout() == phi::DataLayout::SPARSE_COO) { kernel_name = "sparse_coo_to_csr"; @@ -148,15 +148,15 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { // create empty SparseCooTensor phi::DenseTensor non_zero_crows( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(crows_meta)); phi::DenseTensor non_zero_cols( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(cols_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -179,7 +179,7 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "sparse_coo_to_dense"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_dense"; @@ -211,7 +211,7 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { // create empty SparseCooTensor auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); @@ -228,4 +228,4 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { } // namespace experimental } // namespace paddle -PT_REGISTER_API(SparseApi); +PD_REGISTER_API(SparseApi); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 963aeec328e2ad2ecf24bb100d3035d3de4251f8..311dd0fc30941d2afb9f1bc1e7ae57f3a449a254 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include #include "glog/logging.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" @@ -34,7 +33,7 @@ limitations under the License. */ * * We hope to organize the basic implementation of Tensor and the logic related * to Tensor computation into an independent library, which we call - * [Tensor Operation Library, pten], so we extract or rewrite the original + * [Tensor Operation Library, phi], so we extract or rewrite the original * Kernels. * * In the future, the training library, inference library and custom operators @@ -299,72 +298,7 @@ gpuStream_t Tensor::stream() const { } #endif -/* Part 5: Data Transform methods */ - -template -Tensor Tensor::copy_to(const PlaceType &target_place) const { - LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " - "2.3, and will be removed in version 2.4, please use " - "`copy_to` method without template argument instead. " - "reason: copying a Tensor to another device does not need " - "to specify the data type template argument."; - return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); -} - -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; - -Tensor Tensor::copy_to(Backend backend, bool blocking) const { - return experimental::copy_to(*this, backend, blocking); -} - -void Tensor::copy_(const Tensor &src, bool blocking) { - if (!src.is_initialized()) { - return; - } - VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { - PADDLE_ENFORCE_EQ(dtype(), - src.dtype(), - platform::errors::PreconditionNotMet( - "Tensor %s has different data type with Tensor %s, " - "Tensor Copy cannot be performed!", - name(), - src.name())); - PADDLE_ENFORCE_EQ(impl()->type_info().id(), - src.impl()->type_info().id(), - platform::errors::PreconditionNotMet( - "Tensor %s has different type with Tensor %s, Tensor " - "Copy cannot be performed!", - name(), - src.name())); - } - auto copy_tensor = - src.copy_to(phi::TransToPtenBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); -} - -/* Part 6: Status utils methods */ +/* Part 5: Status utils methods */ bool Tensor::defined() const { return impl_ != nullptr; } @@ -376,7 +310,7 @@ bool Tensor::is_initialized() const { void Tensor::reset() { impl_.reset(); } -/* Part 7: Operator overloading */ +/* Part 6: Operator overloading */ Tensor &Tensor::operator=(const Tensor &x) & { impl_ = x.impl_; diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 
b67810d610f2fbd26d46efe7e2e5ff8343d62aab..aefa26952d1e5f224112576bfbd74be80cca72cc 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -14,15 +14,83 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/tensor_base.h" + namespace paddle { namespace experimental { // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); +Tensor copy_to(const Tensor &x, Backend backend, bool blocking); Tensor Tensor::cast(DataType target_type) const { return experimental::cast(*this, target_type); } +Tensor Tensor::copy_to(Backend backend, bool blocking) const { + return experimental::copy_to(*this, backend, blocking); +} + +template +Tensor Tensor::copy_to(const PlaceType &target_place) const { + LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " + "2.3, and will be removed in version 2.4, please use " + "`copy_to` method without template argument instead. " + "reason: copying a Tensor to another device does not need " + "to specify the data type template argument."; + return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); +} + +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor Tensor::copy_to>( + const PlaceType &target_place) const; +template PADDLE_API Tensor Tensor::copy_to>( + const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; + +void Tensor::copy_(const Tensor &src, bool blocking) { + if (!src.is_initialized()) { + return; + } + VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); + if (defined()) { + PADDLE_ENFORCE_EQ(dtype(), + src.dtype(), + platform::errors::PreconditionNotMet( + "Tensor %s has different data type with Tensor %s, " + "Tensor Copy cannot be performed!", + name(), + src.name())); + PADDLE_ENFORCE_EQ(impl()->type_info().id(), + src.impl()->type_info().id(), + platform::errors::PreconditionNotMet( + "Tensor %s has different type with Tensor %s, Tensor " + "Copy cannot be performed!", + name(), + src.name())); + } + auto copy_tensor = + src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); + set_impl(copy_tensor.impl()); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 74ecb3cd65262c3e0598134979c54c02b029d6ee..6d056b54b70058e33501083d9754aa27466c0f59 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS +cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) diff --git 
a/paddle/phi/api/lib/utils/storage.cc b/paddle/phi/api/lib/utils/storage.cc index db3f5f0c8f98bcd4831ba7be69537e9db9efbee2..09ff18d10e312f1f1be130bb2411316dca184458 100644 --- a/paddle/phi/api/lib/utils/storage.cc +++ b/paddle/phi/api/lib/utils/storage.cc @@ -19,7 +19,7 @@ namespace experimental { ExternalStorage::ExternalStorage(void* ptr, size_t size, - const paddle::platform::Place& place) + const phi::Place& place) : phi::Storage(std::make_shared(ptr, size, place)), size_(size) {} @@ -29,11 +29,11 @@ ExternalStorage::ExternalStorage(const phi::intrusive_ptr& root, : Storage(std::make_shared( static_cast(root->data()) + delta, size, root->place())), size_(size) { - PADDLE_ENFORCE_LE(static_cast(delta + size), - root->size(), - paddle::platform::errors::InvalidArgument( - "The size of the external storage does " - "not meet the metadata requirements.")); + PADDLE_ENFORCE_LE( + static_cast(delta + size), + root->size(), + phi::errors::InvalidArgument("The size of the external storage does " + "not meet the metadata requirements.")); } } // namespace experimental diff --git a/paddle/phi/api/lib/utils/storage.h b/paddle/phi/api/lib/utils/storage.h index ede5f804836621a88a294d05cbae6a15c9eceb81..c2eedd0fa63f787d7aff6e5f20d807f363bc8b95 100644 --- a/paddle/phi/api/lib/utils/storage.h +++ b/paddle/phi/api/lib/utils/storage.h @@ -30,7 +30,7 @@ class ExternalStorage : public phi::Storage { static const char* name() { return "ExternalStorage"; } void Realloc(size_t n) override { - PADDLE_THROW(paddle::platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "The external shared storage cannot be reallocated.")); } @@ -55,7 +55,7 @@ class ExternalStorage : public phi::Storage { const phi::Place& place() const override { PADDLE_ENFORCE_NOT_NULL( data_, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "Unable to visit place as data_ has not been initialized yet.")); return data_->place(); } diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index fc56d201fe3ccc736fdef834e69426e5f0384bf9..31325e22afae31e55a3a2d939739d6745ccd3d36 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -31,13 +31,13 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { } } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src) { return std::make_unique(src); } -phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); +phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { @@ -55,21 +55,21 @@ phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { } } -phi::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src) { +phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src) { return {src}; } -phi::ScalarArray MakePtenScalarArrayFromVar( +phi::ScalarArray MakePhiScalarArrayFromVar( const framework::Variable& variable) { - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; 
framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - return MakePtenScalarArray(tmp_tensor); + return MakePhiScalarArray(tmp_tensor); } else { - return MakePtenScalarArray(tensor); + return MakePhiScalarArray(tensor); } } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -80,12 +80,12 @@ phi::ScalarArray MakePtenScalarArrayFromVar( } // TODO(chentianyu03): Inplace with ScalarArray constructor -phi::ScalarArray MakePtenScalarArrayFromVarList( +phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list) { if (variable_list.size() == 0) { return phi::ScalarArray(); } - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); std::vector vector_data; vector_data.reserve(variable_list.size()); diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 51aca6a52b41cd59858f3c138423c3debdb40eaf..8b30d5421ab943d568a046ca0fe4698849780ffd 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -30,17 +30,16 @@ limitations under the License. */ namespace paddle { namespace experimental { -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src); -phi::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src); +phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src); -phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable); +phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable); -phi::ScalarArray MakePtenScalarArrayFromVar( - const framework::Variable& variable); +phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); -phi::ScalarArray MakePtenScalarArrayFromVarList( +phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 441bd0a8c303b5e45f173f20e78ca2e65b9fc314..43e477ef32e9c2a3d914447d610cd6f07b73a92a 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -12,12 +12,16 @@ if(WITH_XPU) add_subdirectory(xpu) endif() -cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) +cc_library(phi_context SRCS all_context.cc DEPS device_context cpu_context) if(WITH_XPU) - add_dependencies(pten_context xpu_context) + add_dependencies(phi_context xpu_context) endif() if(WITH_GPU) - add_dependencies(pten_context gpu_context) + add_dependencies(phi_context gpu_context) +endif() + +if(WITH_CUSTOM_DEVICE) + add_dependencies(phi_context custom_context) endif() diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index b53c5ce5c780cb09bd752de1c27c6ef87776aff2..57e6f084fd4c9a643822ddeb46418b0587cb982e 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -18,15 +18,18 @@ limitations under the License. 
*/ // In order to avoid including the header files of each backend in turn, // add this header file // Note: Limit the entry of DeviceContext to backends to avoid multiple include -// path replacement after implementing pten DeviceContext +// path replacement after implementing phi DeviceContext #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. #include "paddle/fluid/platform/device_context.h" namespace phi { using DeviceContextPool = paddle::platform::DeviceContextPool; } // namespace phi +#endif diff --git a/paddle/phi/backends/cpu/CMakeLists.txt b/paddle/phi/backends/cpu/CMakeLists.txt index 965b33f3800edf9597b07ad2446637d2c505fe0f..82ea42566fc1f46a51f4dbf049dcb7470c633c25 100644 --- a/paddle/phi/backends/cpu/CMakeLists.txt +++ b/paddle/phi/backends/cpu/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_MKLDNN) # TODO(wilber): support mkldnn context. - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn eigen3) + cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context mkldnn eigen3) else() - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context eigen3) + cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context eigen3) endif() diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index e67df65850f15545d7da7a21c5edf30c53661b4d..aa14c2a8e3862139b3149bbcdcfa169d7c292377 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/forwards.h" #include "paddle/phi/core/device_context.h" -// TODO(wilber): Do we need to use place in pten kernel? +// TODO(wilber): Do we need to use place in phi kernel? 
#include "paddle/phi/common/place.h" namespace phi { diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index 9a7de35dd4e66c687bf501845d7b079f90f42464..cb54d3675687d9ae7145c9ac01bc874e811b08f7 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -1,3 +1,3 @@ if (WITH_CUSTOM_DEVICE) - cc_library(custom_context SRCS custom_context.cc DEPS pten_device_context device_manager) + cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager) endif() diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index 445f550839160f79a757b50c74080cf3741aa76f..bde3b6a08539b51e06442ef6090f99cbea7e9de9 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc @@ -32,8 +32,8 @@ struct CustomContext::Impl { const Place& GetPlace() const { return place_; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); + void* stream() const { + return reinterpret_cast(stream_->raw_stream()); } void Wait() const { stream_->Wait(); } @@ -47,7 +47,7 @@ void CustomContext::Init() { impl_->Init(); } const Place& CustomContext::GetPlace() const { return impl_->GetPlace(); } -C_Stream CustomContext::stream() const { return impl_->stream(); } +void* CustomContext::stream() const { return impl_->stream(); } void CustomContext::Wait() const { return impl_->Wait(); } diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 109f5e53707f6ed3a04efb5680e6ec42649e13ef..37b0ee21219b59a0a79d748f6cd4ab0bc289440b 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/device/device_ext.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -30,7 +29,7 @@ class CustomContext : public DeviceContext { const Place& GetPlace() const override; /*! \brief Return stream in the device context. */ - C_Stream stream() const; + void* stream() const; // Wait for all operations completion in the stream. 
void Wait() const override; diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index b7242fc76df7c5db69d58363de6f5427b397aaa6..bc5ef3cd5c078798c5b178df6e7c5341a6fa9d1b 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) +cc_library(phi_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) + hip_library(phi_dynload_cuda SRCS ${HIP_SRCS} DEPS phi_dynamic_loader) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) elseif (WITH_ASCEND_CL) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc npu_hccl) else() - nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) + nv_library(phi_dynload_cuda SRCS ${CUDA_SRCS} DEPS phi_dynamic_loader) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) endif() if (WITH_MKLML) - cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml) + cc_library(phi_dynload_mklml SRCS mklml.cc DEPS phi_dynamic_loader mklml) endif() -cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader) -add_dependencies(pten_dynload_lapack extern_lapack) +cc_library(phi_dynload_lapack SRCS lapack.cc DEPS phi_dynamic_loader) +add_dependencies(phi_dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader) - target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE}) + cc_library(phi_dynload_mklrt SRCS mklrt.cc DEPS phi_dynamic_loader) + target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index ff000d27c4f2e185c88259e2353e476b1ff9220b..02d626d5f98f9fc0c260a55c846031634b68e144 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -54,7 +54,7 @@ bool HasCUDNN() { void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( cudnn_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load cudnn shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc index 14240af41046c3a735b30392b0ab7685bc3d5806..596a68c1ed6aad96942ddd2b5eee82b8102e2444 100644 --- a/paddle/phi/backends/dynload/cufft.cc +++ b/paddle/phi/backends/dynload/cufft.cc @@ -33,7 +33,7 @@ bool HasCUFFT() { void EnforceCUFFTLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( cufft_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load cufft shared library. 
Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index fe98fa6bd37ef345b73bd85f1384a2574222dcb3..a526fbfd926393701e2ebb076fa9208810d2be26 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -63,7 +64,8 @@ extern void *cupti_dso_handle; __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 473c58b33eebc46a62b6b31af10d6b71b0fff53d..2f35e22a18f820cd15325d8516447e3652c132f1 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include #endif -// TODO(wilber): The pten computing library requires a component to manage flags +// TODO(wilber): The phi computing library requires a component to manage flags // (maybe not use gflags). #include "gflags/gflags.h" #include "glog/logging.h" @@ -299,8 +299,8 @@ static inline void* GetDsoHandleFromSearchPath( #endif // !_WIN32 if (throw_on_error) { // NOTE: Special error report case, no need to change its format - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - error_msg, dso_name, errorno)); + PADDLE_THROW( + phi::errors::PreconditionNotMet(error_msg, dso_name, errorno)); } else { LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); } @@ -547,14 +547,11 @@ void* GetOpDsoHandle(const std::string& dso_name) { void* GetNvtxDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW( - paddle::platform::errors::Unimplemented("Nvtx do not support Apple.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Apple.")); #elif defined(_WIN32) - PADDLE_THROW( - paddle::platform::errors::Unimplemented("Nvtx do not support Windows.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Windows.")); #elif !defined(PADDLE_WITH_CUDA) - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Nvtx do not support without CUDA.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support without CUDA.")); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); #endif diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc index a57574dbab13bc88065cb91b9b175f164799584e..e7916873ccfde7e1e5d0933045c9b44557f2f07a 100644 --- a/paddle/phi/backends/dynload/miopen.cc +++ b/paddle/phi/backends/dynload/miopen.cc @@ -58,7 +58,7 @@ bool HasCUDNN() { void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( miopen_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load miopen shared library. 
Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/tensorrt.h b/paddle/phi/backends/dynload/tensorrt.h index 77f25ec0b5aaff99fcaba8cae418d4045dfedf3a..cd8c6457f1b91b938f1ef927119c9ec63a7b6e1b 100644 --- a/paddle/phi/backends/dynload/tensorrt.h +++ b/paddle/phi/backends/dynload/tensorrt.h @@ -54,21 +54,21 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - std::call_once(tensorrt_dso_flag, []() { \ - tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \ - }); \ - static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ - PADDLE_ENFORCE_NOT_NULL(p_##__name, \ - paddle::platform::errors::Unavailable( \ - "Load tensorrt api %s failed", #__name)); \ - using tensorrt_func = decltype(&::__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL( \ + p_##__name, \ + phi::errors::Unavailable("Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ @@ -80,7 +80,7 @@ extern void* tensorrt_plugin_dso_handle; }); \ static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \ PADDLE_ENFORCE_NOT_NULL(p_##__name, \ - paddle::platform::errors::Unavailable( \ + phi::errors::Unavailable( \ "Load tensorrt plugin %s failed", #__name)); \ using tensorrt_plugin_func = decltype(&::__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index 09591f79ae8fcdfe6430c256dd0defad272a46b3..d14e94024f90fbb00f4ef1ea6963dcc7692924fa 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -1,9 +1,9 @@ if(WITH_GPU) add_subdirectory(cuda) - nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda) + nv_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda) elseif(WITH_ROCM) add_subdirectory(rocm) - hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda) + hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3) +cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3) diff --git a/paddle/phi/backends/gpu/cuda/CMakeLists.txt b/paddle/phi/backends/gpu/cuda/CMakeLists.txt index 7eb1983a793bcfff6fd43040f006bafbfb5012bd..a3393f97d7559314dabf55444c1d8961705a4f05 100644 --- a/paddle/phi/backends/gpu/cuda/CMakeLists.txt +++ b/paddle/phi/backends/gpu/cuda/CMakeLists.txt @@ -1 +1 @@ -nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda) +nv_library(phi_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce phi_dynload_cuda) diff --git 
a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc index f8e4ec02bc39e3406437a0503d4cd9622565dbeb..7be21e85f0005b9bfe7849ac6f12561cf108c7e3 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_info.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc @@ -14,7 +14,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(pten): remove fluid headers. +// TODO(phi): remove fluid headers. #include "paddle/fluid/platform/enforce.h" static std::once_flag g_device_props_size_init_flag; @@ -74,13 +74,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); @@ -93,26 +93,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; @@ -125,13 +125,13 @@ bool TensorCoreAvailable() { } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); @@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); @@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); @@ -174,13 +174,13 @@ int GetCurrentDeviceId() { } std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); std::array ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); @@ -213,7 +213,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) { } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " @@ -233,13 +233,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } @@ -294,13 +294,13 @@ gpuError_t GpuGetLastError() { return cudaGetLastError(); } // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements // for more detail about managed memory requirements bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #if defined(__linux__) || defined(_WIN32) int ManagedMemoryAttr; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( @@ -312,13 +312,13 @@ bool IsGPUManagedMemorySupported(int dev_id) { } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #ifdef __linux__ return IsGPUManagedMemorySupported(dev_id) && GetGPUComputeCapability(dev_id) >= 60; diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 28057abed542abd2c120d1199dab7ba776929812..dbcc1660c6472cdddaaa3bea72854f61370c19a0 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -49,7 +49,7 @@ limitations under the License. */ // without eigen. #include "unsupported/Eigen/CXX11/Tensor" -// TODO(pten): remove fluid header. +// TODO(phi): remove fluid header. #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 5fa80d3a577419350845d64e7b8cd44f03bb3847..603ce0817c4ebdcb17bb97b14dd0700badcf2385 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -220,4 +220,11 @@ class GPUContext : public DeviceContext { std::unique_ptr impl_; }; +// Note: In order to register the kernel of CUDNN, GPUDNNContext is required. +// Currently, CUDNN kernel directly uses GPUContext. 
But if the kernel function +// has the same name, this will lead to duplicate instantiations of GPU kernel +// and GPUDNN kernel function, so if we using GPUDNNContext = GPUContext, we +// must use different function name for cudnn kernel +using GPUDNNContext = GPUContext; + } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 21193755044579eb4f19936dca1c2b6b3c5b4bea..e45b465122588263e47d3ccda47c29bb8bf3b6bd 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -29,6 +29,7 @@ #include #include #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/enforce.h" #ifdef __HIPCC__ // HIP results in error or nan if > 256 @@ -100,12 +101,12 @@ struct GpuLaunchConfig { inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, int64_t numel, int vec_size = 1) { - PADDLE_ENFORCE_GT(numel, - 0, - paddle::platform::errors::InvalidArgument( - "element quantity should be greater than 0," - " but received value is: %d.", - numel)); + PADDLE_ENFORCE_GT( + numel, + 0, + phi::errors::InvalidArgument("element quantity should be greater than 0," + " but received value is: %d.", + numel)); // Get compute_capability const int capability = context.GetComputeCapability(); /* If thread number per block is 64/128/256/512, cuda performs better.*/ @@ -142,18 +143,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, int x_dim, int y_dim) { - PADDLE_ENFORCE_GT(x_dim, - 0, - paddle::platform::errors::InvalidArgument( - "x dim number should greater than 0," - " but received value is: %d", - x_dim)); - PADDLE_ENFORCE_GT(y_dim, - 0, - paddle::platform::errors::InvalidArgument( - "y dim number should greater than 0," - " but received value is: %d", - y_dim)); + PADDLE_ENFORCE_GT( + x_dim, + 0, + phi::errors::InvalidArgument("x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT( + y_dim, + 0, + phi::errors::InvalidArgument("y dim number should greater than 0," + " but received value is: %d", + y_dim)); const int kThreadsPerBlock = 256; int block_cols = (std::min)(x_dim, kThreadsPerBlock); diff --git a/paddle/phi/backends/gpu/rocm/CMakeLists.txt b/paddle/phi/backends/gpu/rocm/CMakeLists.txt index 181f92cbfc31c7f2a407b3f474f3361ae40cea3c..257e4cc8afbcf20966dd377c4945f9a9fa9f8579 100644 --- a/paddle/phi/backends/gpu/rocm/CMakeLists.txt +++ b/paddle/phi/backends/gpu/rocm/CMakeLists.txt @@ -1 +1 @@ -hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda) +hip_library(phi_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index c7390cfb6a2198904f081ffbb8f5f4f8532324e2..23e58d34b25725c048a39244d27f0afd0a917e0f 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -15,7 +15,7 @@ #include #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(pten): remove fluid headers. +// TODO(phi): remove fluid headers. 
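// A hedged usage sketch for GetGpuLaunchConfig1D shown above in
// gpu_launch_config.h. The kernel, the namespace, and the
// block_per_grid / thread_per_block member names are assumptions for
// illustration, not guaranteed by this patch.
#include "paddle/phi/backends/gpu/gpu_launch_config.h"

__global__ void ScaleKernel(float* x, float a, int64_t n) {
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

void LaunchScale(const phi::GPUContext& ctx, float* x, float a, int64_t n) {
  // GetGpuLaunchConfig1D enforces n > 0 with the phi::errors message above.
  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, n);
  ScaleKernel<<<config.block_per_grid, config.thread_per_block, 0,
                ctx.stream()>>>(x, a, n);
}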
#include "paddle/fluid/platform/enforce.h" static std::once_flag g_device_props_size_init_flag; @@ -78,13 +78,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = hipDeviceGetAttribute( &major, hipDeviceAttributeComputeCapabilityMajor, id); @@ -97,26 +97,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); return driver_version; @@ -125,13 +125,13 @@ int GetGPUDriverVersion(int id) { bool TensorCoreAvailable() { return false; } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); @@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); @@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); @@ -174,13 +174,13 @@ int GetCurrentDeviceId() { } std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); std::array ret; int size; auto error_code_x = @@ -216,7 +216,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) { } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " @@ -235,13 +235,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); } @@ -293,13 +293,13 @@ void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } gpuError_t GpuGetLastError() { return hipGetLastError(); } bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #if defined(__linux__) || defined(_WIN32) int ManagedMemoryAttr; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( @@ -311,13 +311,13 @@ bool IsGPUManagedMemorySupported(int dev_id) { } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #ifdef __linux__ return IsGPUManagedMemorySupported(dev_id) && GetGPUComputeCapability(dev_id) >= 60; diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt index 65341dd206fd30c318eb72cb74c4ad3ac4ae212b..4d885757bb1a60578a923e05544d6d209b73acf9 100644 --- a/paddle/phi/backends/xpu/CMakeLists.txt +++ b/paddle/phi/backends/xpu/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place) -cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info) +cc_library(phi_xpu_info SRCS xpu_info.cc DEPS enforce xpulib phi_place) +cc_library(xpu_context SRCS xpu_context.cc DEPS phi_device_context phi_xpu_info) diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index bcfebf6d49fb87b7fa1a0fc29595f6f20ca57f77..29b048ead852dd91788316c2284b438d7dcbd61c 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -173,7 +173,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); ::phi::backends::xpu::details::ExternalApiType< \ __XPU_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -183,7 +183,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); do { \ auto __cond__ = (COND); \ if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -192,7 +192,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); #define PADDLE_ENFORCE_XDNN_NOT_NULL(ptr) \ do { \ if (UNLIKELY(ptr == nullptr)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_xdnn_error_msg( \ baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE, \ "XPU memory is not enough")); \ diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 527e13238082ec154b3ece67ca719425ae40d211..d454fc0734c66aca37a55c53ec5a2d9206cfcc5b 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/common/place.h" -// TODO(wilber): The pten computing library requires a component to manage +// TODO(wilber): The phi computing library requires a component to manage // flags. 
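// A brief hedged illustration of the XDNN checking macros touched above in
// enforce_xpu.h. The xdnn call and its signature are illustrative only; the
// point is that the macros now build their message with phi::errors::External.
//
//   int r = baidu::xpu::api::add<float>(dev_ctx.x_context(), x, y, out, numel);
//   PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");        // throws External on failure
//   PADDLE_ENFORCE_XDNN_NOT_NULL(workspace_ptr);  // workspace_ptr is assumed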
#include "paddle/fluid/platform/flags.h" @@ -100,7 +100,7 @@ void SetXPUDeviceId(int id) { PADDLE_ENFORCE_LT( id, GetXPUDeviceCount(), - paddle::platform::errors::InvalidArgument("id must less than XPU count")); + phi::errors::InvalidArgument("id must less than XPU count")); PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); } diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index feaf0e12bdb16b04a09814d45a25b6a504a7c697..85a1424ee34e04b50a077f5d8ac88d0a0d2fbe78 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1 +1 @@ -cc_library(pten_place SRCS place.cc) +cc_library(phi_place SRCS place.cc) diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index f7c39eacae9bd1192def55aedccd04fdfc1ccd33..4b7bf65be39cbc83688e7dab3fdd745c2be82b22 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -50,7 +50,7 @@ enum class Backend : uint8_t { // the third library backend MKLDNN, - CUDNN, + GPUDNN, // cuDNN and hipDNN // end of backend types NUM_BACKENDS, @@ -71,17 +71,17 @@ enum class Backend : uint8_t { * Of course, we have also considered solving this problem through different * named macros, for example, if we define * - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND * * Based on this design pattern, the dtype and layout also have the same * requirements, this cause we need to define a series of macros * - * PT_REGISTER_KERNEL_FOR_ALL_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE * * It makes the system of registering macros more complicated, we think * this is not a simple design, so we still adopt the design of providing @@ -112,8 +112,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::MKLDNN: os << "MKLDNN"; break; - case Backend::CUDNN: - os << "CUDNN"; + case Backend::GPUDNN: + os << "GPUDNN"; break; default: { size_t device_type_id_ = static_cast(backend) - @@ -130,6 +130,29 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { return os; } +inline Backend StringToBackend(const char* backend_cstr) { + std::string s(backend_cstr); + if (s == std::string("Undefined")) { + return Backend::UNDEFINED; + } + if (s == std::string("CPU")) { + return Backend::CPU; + } else if (s == std::string("GPU")) { + return Backend::GPU; + } else if (s == std::string("XPU")) { + return Backend::XPU; + } else if (s == std::string("NPU")) { + return Backend::NPU; + } else if (s == std::string("MKLDNN")) { + return Backend::MKLDNN; + } else if (s == std::string("GPUDNN")) { + return Backend::GPUDNN; + } else { + return static_cast(static_cast(Backend::NUM_BACKENDS) + + phi::GetOrRegisterGlobalDeviceTypeId(s)); + } +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 1cdcdef2c12eec1c59c0fd2dfdf1c4dd6e62bd37..6ed9c88d705106ce3b03732096fa34b23422875f 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,6 +988,18 @@ inline std::ostream& 
operator<<(std::ostream& os, const float16& a) { return os; } +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace dtype } // namespace phi diff --git a/paddle/phi/common/layout.h b/paddle/phi/common/layout.h index 30832bd60bc0ea167b37de08240aad06c0fe7d1b..648fc02d054cbfd89991e66801c1dac5dffbfe69 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/phi/common/layout.h @@ -32,7 +32,7 @@ enum class DataLayout { NUM_DATA_LAYOUTS, // See Note [ Why we need ALL in basic kernel key member? ] ALL_LAYOUT = UNDEFINED, - // Note: Unify pten DataLayout and fluid::framework::DataLayout, + // Note: Unify phi DataLayout and fluid::framework::DataLayout, // for compatible with fluid DataLayout, here need prefix `k` // Note: The original `kAnyLayout (enum value 2)` is a strange design. // `kAnyLayout` originally cannot represent any kind of Layout, diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index bc179e8fed74e22fd85d7ff9372d816edfdce575..644bf3679af2a3ebf05f739a6e8d42011c7e664c 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -43,7 +43,7 @@ const char *AllocationTypeStr(AllocationType type) { case AllocationType::MLU: return "mlu"; default: - PD_THROW("Invalid pten device type."); + PD_THROW("Invalid phi device type."); return {}; } } diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index b6adb1c2932bff5842ef74947c149f23b8b79a02..36fb910cad6c705952a0e3858eb09810d1ea6f5f 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -188,6 +188,7 @@ class MLUPlace : public Place { class CustomPlace : public Place { public: + CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {} explicit CustomPlace(const std::string dev_type) : Place(AllocationType::CUSTOM, 0, dev_type) {} CustomPlace(const std::string dev_type, int device_id) diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 1da77a0fa196413436030fc2864514cc222af6f8..72cef89d300c8d60811bde7cf667275b37fedc6f 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -25,7 +25,6 @@ namespace experimental { template class ScalarBase { public: - bool FromTensor() const { return is_from_tensor_; } // Constructor support implicit ScalarBase(double val) : dtype_(DataType::FLOAT64) { // NOLINT data_.f64 = val; @@ -157,6 +156,10 @@ class ScalarBase { CopyScalar(other, this); } + bool FromTensor() const { return is_from_tensor_; } + + void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; } + template inline RT to() const { switch (dtype_) { @@ -191,6 +194,8 @@ class ScalarBase { } } + DataType dtype() const { return dtype_; } + private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 18f209377bafc787268e3e510931661d6dff1cb8..f4f57a0acbbb386a3642a05e0d0dc70cd082a4d8 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -2,28 +2,30 @@ add_subdirectory(compat) cc_library(errors SRCS errors.cc) -set(pten_enforce_deps errors flags) +set(phi_enforce_deps errors flags) if(WITH_GPU) - set(pten_enforce_deps ${pten_enforce_deps} external_error_proto) + set(phi_enforce_deps ${phi_enforce_deps} external_error_proto) endif() -cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS ${pten_enforce_deps}) +cc_library(phi_enforce INTERFACE SRCS enforce.cc DEPS ${phi_enforce_deps}) -cc_library(kernel_factory SRCS 
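// A minimal sketch of the intended use of the MPTypeTrait added to float16.h
// above: accumulate low-precision values in their "math" type (float when T is
// float16) to limit rounding error. The helper below is illustrative, not part
// of the patch.
#include "paddle/phi/common/float16.h"

template <typename T>
T MeanOfN(const T* x, int n) {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;  // float for float16
  MPType sum = static_cast<MPType>(0);
  for (int i = 0; i < n; ++i) {
    sum += static_cast<MPType>(x[i]);
  }
  return static_cast<T>(sum / static_cast<MPType>(n));
}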
kernel_factory.cc DEPS pten_enforce fluid_convert_utils) -cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context) +cc_library(kernel_factory SRCS kernel_factory.cc DEPS phi_enforce fluid_convert_utils) +cc_library(kernel_context SRCS kernel_context.cc DEPS phi_enforce phi_context) -cc_library(ddim SRCS ddim.cc DEPS pten_enforce) -cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector) -cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector) +cc_library(ddim SRCS ddim.cc DEPS phi_enforce) +cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce) +cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce) +cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce) -cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base) cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor mixed_vector pten_enforce ddim) +cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) + +cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt index c6bc9e15a535b52def1caef463a8a9228ab51e4a..3423e380970df8a69dc047325e80024dece1f914 100644 --- a/paddle/phi/core/compat/CMakeLists.txt +++ b/paddle/phi/core/compat/CMakeLists.txt @@ -1,14 +1,14 @@ -cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce) +cc_library(arg_map_context SRCS arg_map_context.cc DEPS phi_enforce) cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce) set(convert_utils_deps data_type place op_utils) if(WITH_GPU) - set(convert_utils_deps ${convert_utils_deps} pten_gpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_gpu_info) elseif(WITH_ROCM) - set(convert_utils_deps ${convert_utils_deps} pten_gpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_gpu_info) elseif(WITH_XPU) - set(convert_utils_deps ${convert_utils_deps} pten_xpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_xpu_info) endif() if(WITH_CUSTOM_DEVICE) set(convert_utils_deps ${convert_utils_deps} device_manager) diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 39cb3fb5692679ccd624fd2d79bec2bbeb04d257..af29b3bab5c3cc4b2e1caeb4eee9689179464d01 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -79,7 +79,7 @@ class ArgumentMappingContext { virtual bool HasOutput(const std::string& name) const = 0; virtual bool HasAttr(const std::string& name) const = 0; - // now we can't use Attribute here, it will cause pten relay on + // now we can't use Attribute here, it will cause phi relay on // boost::variant and BlockDesc virtual paddle::any Attr(const std::string& name) const = 0; diff --git 
a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index b4e7e127995ec2d0eeda788e9d6e6f9ccf12f8b1..3b7a733ede90464328600ebd3c7d371314b99cc3 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -25,11 +25,13 @@ limitations under the License. */ namespace phi { -Backend TransToPtenBackend(const phi::Place& place) { +Backend TransToPhiBackend(const phi::Place& place) { if (place.GetType() == phi::AllocationType::CPU) { return Backend::CPU; } else if (place.GetType() == phi::AllocationType::GPU) { return Backend::GPU; + } else if (place.GetType() == phi::AllocationType::XPU) { + return Backend::XPU; } else if (place.GetType() == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + @@ -39,7 +41,7 @@ Backend TransToPtenBackend(const phi::Place& place) { } } -phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { +phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { // NOTE(zhiqiu): GetCurrentDeviceId not always success, and device id is not // always needed. // So, add set_device_id parameter here. @@ -56,7 +58,7 @@ phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case phi::Backend::CUDNN: + case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #endif @@ -85,21 +87,21 @@ phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { } } -std::string TransToPtenKernelName(const std::string& fluid_op_name) { +std::string TransToPhiKernelName(const std::string& fluid_op_name) { return OpUtilsMap::Instance().GetBaseKernelName(fluid_op_name); } -const std::string& TransToFluidOpName(const std::string& pten_kernel_name) { +const std::string& TransToFluidOpName(const std::string& phi_kernel_name) { auto& base_kernel_name_map = OpUtilsMap::Instance().base_kernel_name_map(); auto it = std::find_if(base_kernel_name_map.begin(), base_kernel_name_map.end(), - [&pten_kernel_name](const auto& pair) { - return pair.second == pten_kernel_name; + [&phi_kernel_name](const auto& pair) { + return pair.second == phi_kernel_name; }); if (it != base_kernel_name_map.end()) { return it->first; } - return pten_kernel_name; + return phi_kernel_name; } } // namespace phi diff --git a/paddle/phi/core/compat/convert_utils.h b/paddle/phi/core/compat/convert_utils.h index 058f0ecdf7bc2b5c81a55eb1a6e94cb5ddc30296..621459764873e6681d57813b227076db0b44dd04 100644 --- a/paddle/phi/core/compat/convert_utils.h +++ b/paddle/phi/core/compat/convert_utils.h @@ -22,10 +22,10 @@ limitations under the License. 
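// A hedged usage sketch for the renamed conversion helpers above. The values
// and the op name are illustrative; TransToPhiKernelName simply falls back to
// its input when no base-kernel-name mapping is registered.
#include <string>
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/compat/convert_utils.h"

void ConversionSketch() {
  phi::Backend backend = phi::TransToPhiBackend(phi::GPUPlace(0));  // Backend::GPU
  phi::Place place = phi::TransToPhiPlace(phi::Backend::CPU);       // CPUPlace
  std::string kernel = phi::TransToPhiKernelName("reshape2");       // mapped name, if any
  (void)backend;
  (void)place;
  (void)kernel;
}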
*/ namespace phi { -std::string TransToPtenKernelName(const std::string& fluid_op_name); -const std::string& TransToFluidOpName(const std::string& pten_kernel_name); +std::string TransToPhiKernelName(const std::string& fluid_op_name); +const std::string& TransToFluidOpName(const std::string& phi_kernel_name); -Backend TransToPtenBackend(const phi::Place& place); -phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id = true); +Backend TransToPhiBackend(const phi::Place& place); +phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id = true); } // namespace phi diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 5c0c440d8942c83d10bfe092b3fc1782944f1719..bbf634b4b09b90a086505bc173b588d7da2e9668 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -37,7 +37,8 @@ const std::unordered_set standard_kernel_suffixs({ * after 2.0, and can no longer be occupied by the previously abandoned ops. * They are marked here uniformly. */ -const std::unordered_set deprecated_op_names({"flatten", +const std::unordered_set deprecated_op_names({"diag", + "flatten", "flatten_grad", "matmul", "matmul_grad", @@ -164,34 +165,34 @@ struct ArgumentMappingFnRegistrar { } }; -#define PT_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ +#define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_base_kernel_name_ns_check_##op_type, \ - "PT_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_REGISTER_base_kernel_name_ns_check_##op_type, \ + "PD_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ static const ::phi::BaseKernelNameRegistrar \ __registrar_base_kernel_name_for_##op_type(#op_type, #base_kernel_name); \ int TouchBaseKernelNameSymbol_##op_type() { return 0; } -#define PT_DECLARE_BASE_KERNEL_NAME(op_type) \ +#define PD_DECLARE_BASE_KERNEL_NAME(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_ai_name_ns_check_##op_type, \ - "PT_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_DECLARE_ai_name_ns_check_##op_type, \ + "PD_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ extern int TouchBaseKernelNameSymbol_##op_type(); \ UNUSED static int __declare_base_kernel_name_symbol_for_##op_type = \ TouchBaseKernelNameSymbol_##op_type() -#define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ +#define PD_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_arg_map_fn_ns_check_##op_type, \ - "PT_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ + PD_REGISTER_arg_map_fn_ns_check_##op_type, \ + "PD_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ static const ::phi::ArgumentMappingFnRegistrar \ __registrar_arg_map_fn_for_##op_type(#op_type, arg_mapping_fn); \ int TouchArgumentMappingFnSymbol_##op_type() { return 0; } -#define PT_DECLARE_ARG_MAPPING_FN(op_type) \ +#define PD_DECLARE_ARG_MAPPING_FN(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_arg_map_fn_ns_check_##op_type, \ - "PT_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ + PD_DECLARE_arg_map_fn_ns_check_##op_type, \ + "PD_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ extern int TouchArgumentMappingFnSymbol_##op_type(); \ UNUSED static int __declare_arg_map_fn_symbol_for_##op_type = \ TouchArgumentMappingFnSymbol_##op_type() diff --git a/paddle/phi/core/custom_kernel.cc 
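// A hedged sketch of how the renamed PD_REGISTER_* macros from op_utils.h
// above are typically used. The op, its arguments, and the mapping are
// hypothetical; real mappings live under paddle/phi/ops/compat/.
#include "paddle/phi/core/compat/arg_map_context.h"
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {
KernelSignature MyRelu2OpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("my_relu", {"X"}, {}, {"Out"});
}
}  // namespace phi

// Both macros must be invoked at global namespace, as the static_asserts check.
PD_REGISTER_BASE_KERNEL_NAME(my_relu2, my_relu);
PD_REGISTER_ARG_MAPPING_FN(my_relu2, phi::MyRelu2OpArgumentMapping);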
b/paddle/phi/core/custom_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f84a2bd8d9c5d0634f29485fc07f649ea9fb1b9e --- /dev/null +++ b/paddle/phi/core/custom_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/custom_kernel.h" + +namespace phi { + +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { + auto& kernel_info_map = custom_kernel_map.GetMap(); + VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + + for (auto& pair : kernel_info_map) { + PADDLE_ENFORCE_EQ( + KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), + true, + phi::errors::InvalidArgument( + "The kernel %s is not ready for custom kernel registering.", + pair.first)); + + for (auto& info_pair : pair.second) { + auto& kernels = KernelFactory::Instance().kernels(); + PADDLE_ENFORCE_EQ( + kernels[pair.first].find(info_pair.first), + kernels[pair.first].end(), + phi::errors::InvalidArgument( + "The operator <%s>'s kernel: %s has been already existed " + "in Paddle, please contribute PR if it is necessary " + "to optimize the kernel code. Custom kernel does NOT support " + "to replace existing kernel in Paddle.", + pair.first, + info_pair.first)); + + kernels[pair.first][info_pair.first] = info_pair.second; + + VLOG(3) << "Successed in registering operator <" << pair.first + << ">'s kernel: " << info_pair.first + << " to Paddle. It will be used like native ones."; + } + } +} + +} // namespace phi + +#ifdef __cplusplus +extern "C" { +#endif + +// C-API to get global CustomKernelMap. +phi::CustomKernelMap& PD_GetCustomKernelMap() { + return phi::CustomKernelMap::Instance(); +} + +#ifdef __cplusplus +} // end extern "C" +#endif diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..20ae2b7bb7360ab6878617234784157584e01858 --- /dev/null +++ b/paddle/phi/core/custom_kernel.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/macros.h" + +namespace phi { +/** + * Note: + * Used to store kernels' info before registered to KernelFactory. 
+ */ +class CustomKernelMap { + public: + static CustomKernelMap& Instance() { + static CustomKernelMap g_custom_kernel_info_map; + return g_custom_kernel_info_map; + } + + KernelNameMap& Kernels() { return kernels_; } + + const KernelNameMap& GetMap() const { return kernels_; } + + private: + CustomKernelMap() = default; + DISABLE_COPY_AND_ASSIGN(CustomKernelMap); + + KernelNameMap kernels_; +}; + +/** + * Note: + * Used to register custom kernels to KernelFactory. + */ +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map); + +} // namespace phi diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index 1d186fe3b43fe00965db2ff32c51d43d6b7a3c11..ce462d8d954023a1ccd2ff4d33e1cf9611b40513 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include +#include #include #include #include diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index a363d3cbaaa340e183dfa3281800db4a9f72b104..7a0f50533360d71e8cd025a520d753c366c08edb 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -73,7 +73,7 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, size_t requested_size) { PADDLE_ENFORCE_NOT_NULL( allocator, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Required allocator shall not be nullptr, but received nullptr.")); if (this->dtype() != dtype) { VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype; @@ -81,22 +81,22 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, } PADDLE_ENFORCE( valid(), - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The meta data must be valid when call the mutable data function.")); size_t bytes = numel() * SizeOf(this->dtype()); if (requested_size) { PADDLE_ENFORCE_GE(requested_size, bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reserved size %d should be enough to meet the " "volume required by metadata %d.", requested_size, bytes)); bytes = requested_size; } - // TODO(paddle-dev): In case of the allocator of storage_ is different with - // the incoming allocator, we should re-alloc data using the incoming - // allocator. + // NOTE(paddle-dev): In case of the allocator of storage_ is different with + // the incoming allocator, we will re-alloc data using the incoming + // allocator. See DeviceContext.Alloc in core/device_context.cc. 
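// A hedged sketch of the behaviour this note refers to (see the
// DeviceContext::Alloc changes later in this patch): when a tensor already
// holds memory on a different place, the allocation path now clears the old
// holder and re-allocates on the context's place. Function and variable names
// below are illustrative.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"

void MoveToDevice(const phi::GPUContext& gpu_ctx, phi::DenseTensor* t) {
  // Assume *t currently holds CPU memory (e.g. from an earlier HostAlloc).
  // Because t->place() differs from gpu_ctx.GetPlace(), the CPU holder is
  // dropped and float storage is allocated on the GPU owned by gpu_ctx.
  float* device_ptr = gpu_ctx.Alloc<float>(t);
  (void)device_ptr;
}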
if (!holder_ || holder_->size() < bytes + meta_.offset) { meta_.offset = 0; VLOG(10) << "Allocate data with bytes: " << bytes; @@ -112,7 +112,7 @@ const T* DenseTensor::data() const { check_memory_size(); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); @@ -123,7 +123,7 @@ T* DenseTensor::data() { check_memory_size(); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); @@ -133,7 +133,7 @@ void* DenseTensor::data() { check_memory_size(); PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The storage must be valid when call the data function.")); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + meta_.offset); @@ -143,7 +143,7 @@ const void* DenseTensor::data() const { check_memory_size(); PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The storage must be valid when call the data function.")); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + meta_.offset); @@ -151,7 +151,7 @@ const void* DenseTensor::data() const { void DenseTensor::set_meta(DenseTensorMeta&& meta) { PADDLE_ENFORCE(!meta_.valid(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only when the original attribute of Tensor is " "incomplete, can it be reset.")); meta_ = std::move(meta); @@ -160,7 +160,7 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { void DenseTensor::set_meta(const DenseTensorMeta& meta) { PADDLE_ENFORCE( meta.valid(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input meta is invalid, please check the meta attribute.")); meta_.dims = meta.dims; meta_.dtype = meta.dtype; diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 622cedf1d7f91e843efe979c40b9cb298ca3181f..0dddd63099bbca66281c747fd35b8346a2ded726 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -171,6 +171,9 @@ class DenseTensor : public TensorBase, DenseTensorMeta meta_; std::shared_ptr holder_; +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/dense_tensor.inl" +#endif }; + } // namespace phi diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 0547776acad1f3e08752f8ee14d7acf235bdfab4..a422a95346e8b65e91a7404d70c213847e1dcf3e 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -54,22 +54,22 @@ DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta); inline bool IsInitialized() const { return holder_ != nullptr; } template -T* mutable_data(const paddle::platform::Place& place, +T* mutable_data(const phi::Place& place, size_t requested_size = 0); template T* mutable_data(const DDim& dims, - const paddle::platform::Place& place, + const phi::Place& place, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, size_t requested_size = 0); -void* mutable_data(const 
paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, const phi::Stream& stream); diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 6ce8bea35d9dd68353a6677b6e59d3e004c68185..29e7dc01f32db20e3756677fe8a48fcb138b3883 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -161,7 +161,7 @@ void* DenseTensor::mutable_data(const Place& place, /* @jim19930609: The following "mutable_data" only supports specific dtypes defined in OpProto. This part need another clean up once the data type across Fluid - and Pten get unified. + and Phi get unified. */ template inline T* DenseTensor::mutable_data(const DDim& dims, diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index c3e0d2a75228b3211e5d76f95c2f8ff8089b6415..b139eb99dd4846adb3f7ef3a27507a2ca4478e6d 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -13,8 +13,9 @@ // limitations under the License. #include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/core/selected_rows.h" namespace phi { using DataType = paddle::experimental::DataType; @@ -72,6 +73,7 @@ struct DeviceContext::Impl { } void* Alloc(TensorBase* tensor, + const Place& place, DataType dtype = DataType::UNDEFINED, size_t requested_size = 0) const { PADDLE_ENFORCE_NOT_NULL( @@ -81,6 +83,12 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } + // NOTE(paddle-dev): In case of tensor has already hold allocation and + // is going to allocate allocation on new place, we will clear its holder + // firstly and then re-alloc it. + if (tensor->initialized() && tensor->place() != place) { + ClearHolder(tensor); + } auto* allocator = tensor->numel() == 0 ? zero_allocator_ : device_allocator_; return tensor->AllocateFrom( @@ -88,9 +96,11 @@ struct DeviceContext::Impl { } template - T* Alloc(TensorBase* tensor, size_t requested_size = 0) const { + T* Alloc(TensorBase* tensor, + const Place& place, + size_t requested_size = 0) const { DataType dtype = paddle::experimental::CppTypeToDataType::Type(); - return static_cast(Alloc(tensor, dtype, requested_size)); + return static_cast(Alloc(tensor, place, dtype, requested_size)); } void* HostAlloc(TensorBase* tensor, @@ -103,6 +113,9 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } + if (tensor->initialized() && tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } auto* allocator = tensor->numel() == 0 ? 
zero_allocator_ : host_allocator_; return tensor->AllocateFrom( const_cast(allocator), dtype, requested_size); @@ -119,22 +132,52 @@ struct DeviceContext::Impl { gen, phi::errors::InvalidArgument( "Required generator shall not be nullptr, but received nullptr.")); - generator_ = gen; + device_generator_ = gen; } Generator* GetGenerator() const { PADDLE_ENFORCE_NOT_NULL( - generator_, + device_generator_, + phi::errors::InvalidArgument("Required generator_ shall not be " + "nullptr, but received nullptr.")); + return device_generator_; + } + + void SetHostGenerator(Generator* gen) { + PADDLE_ENFORCE_NOT_NULL( + gen, + phi::errors::InvalidArgument( + "Required generator shall not be nullptr, but received nullptr.")); + host_generator_ = gen; + } + + Generator* GetHostGenerator() const { + PADDLE_ENFORCE_NOT_NULL( + host_generator_, phi::errors::InvalidArgument("Required generator_ shall not be " "nullptr, but received nullptr.")); - return generator_; + return host_generator_; } private: + void ClearHolder(TensorBase* tensor) const { + if (!tensor->initialized()) return; + + if (DenseTensor::classof(tensor)) { + static_cast(tensor)->clear(); + } else if (SelectedRows::classof(tensor)) { + static_cast(tensor)->mutable_value()->clear(); + } else { + PADDLE_THROW(errors::Unimplemented( + "Only support DenseTensor and SelectedRows now.")); + } + } + const Allocator* device_allocator_{nullptr}; const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; - Generator* generator_{nullptr}; + Generator* device_generator_{nullptr}; + Generator* host_generator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } @@ -143,13 +186,15 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); + impl_->SetHostGenerator(other.GetHostGenerator()); + impl_->SetGenerator(other.GetGenerator()); } DeviceContext::DeviceContext(DeviceContext&& other) { impl_ = std::move(other.impl_); } -DeviceContext& DeviceContext::operator=(DeviceContext&&) = default; +DeviceContext& DeviceContext::operator=(DeviceContext&& other) = default; DeviceContext::~DeviceContext() = default; @@ -180,12 +225,12 @@ const Allocator& DeviceContext::GetZeroAllocator() const { void* DeviceContext::Alloc(TensorBase* tensor, DataType dtype, size_t requested_size) const { - return impl_->Alloc(tensor, dtype, requested_size); + return impl_->Alloc(tensor, GetPlace(), dtype, requested_size); } template T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const { - return impl_->Alloc(tensor, requested_size); + return impl_->Alloc(tensor, GetPlace(), requested_size); } void* DeviceContext::HostAlloc(TensorBase* tensor, @@ -224,4 +269,12 @@ void DeviceContext::SetGenerator(Generator* gen) { impl_->SetGenerator(gen); } Generator* DeviceContext::GetGenerator() const { return impl_->GetGenerator(); } +void DeviceContext::SetHostGenerator(Generator* gen) { + impl_->SetHostGenerator(gen); +} + +Generator* DeviceContext::GetHostGenerator() const { + return impl_->GetHostGenerator(); +} + } // namespace phi diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 7c1411e3bef3740f11ff39947028ead4d0357771..689f4e4e66d15f60aec873a9e9b9c07797833487 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -132,6 +132,19 @@ class DeviceContext { */ Generator* GetGenerator() const; + /** + * 
@brief Set the host generator for special op. + * + * @param Generator + */ + void SetHostGenerator(Generator*); + /** + * @brief Get the host generator object. + * + * @return Generator + */ + Generator* GetHostGenerator() const; + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 08fe3125287d76654173324e42a2d0773aab444c..0869df143235fcd937d75e7dba908c4efbd7ee95 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,14 +18,14 @@ #include #endif -#ifdef __xpu_kp__ +#if defined(__xpu__) #include #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index d21232ed82296cb48af5c72a32264e5c8fd76085..f3dd056911ecf81d5ca0954114acbd1a3ac19ad9 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -67,6 +67,14 @@ const MetaTensor& InferMetaContext::InputAt(size_t idx) const { return *inputs_.at(idx); } +paddle::optional InferMetaContext::OptionalInputAt( + size_t idx) const { + const auto& input = inputs_.at(idx); + return input ? paddle::optional{static_cast< + const phi::MetaTensor&>(*input)} + : paddle::optional{paddle::none}; +} + std::vector InferMetaContext::InputsBetween(size_t start, size_t end) const { std::vector result; diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 2b98ab22bcdbd43a1863c2d59d93e31c510368b8..203dbb269841ec8616b94c89603af3904eb572c3 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/phi/core/macros.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" +#include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" @@ -50,6 +51,9 @@ class InferMetaContext { const MetaConfig& GetMetaConfig() const; const MetaTensor& InputAt(size_t idx) const; + + paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetween(size_t start, size_t end); @@ -134,6 +138,24 @@ struct InferMetaFnImpl { } }; + template + struct InferMetaFnCallHelper, Tail...> { + template + static void Call(InferMetaContext* ctx, PreviousArgs&... 
pargs) { + static_assert(attr_idx == 0, + "InferMeta's Input should appear before Attributes."); + static_assert(out_idx == 0, + "InferMeta's Input should appear before Outputs."); + const std::pair range = ctx->InputRangeAt(in_idx); + auto arg = ctx->OptionalInputAt(range.first); + + InferMetaFnCallHelper< + Tail...>::template Call(ctx, + pargs..., + arg); + } + }; + template struct InferMetaFnCallHelper&, Tail...> { template @@ -282,10 +304,10 @@ struct InferMetaFnRegistrar { } }; -#define PT_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ +#define PD_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_infer_meta_fn_ns_check_##kernel_name_prefix, \ - "PT_REGISTER_INFER_META_FN must be called in global namespace."); \ + PD_REGISTER_infer_meta_fn_ns_check_##kernel_name_prefix, \ + "PD_REGISTER_INFER_META_FN must be called in global namespace."); \ static const ::phi::InferMetaFnRegistrar \ __registrar_arg_map_fn_for_##kernel_name_prefix( \ #kernel_name_prefix, PT_INFER_META(variadic_infer_meta_fn)) diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index 3c7222f7a5379fe1f9d6c87ffdb38d6e6a8fa48c..a32e0e44f469694c62ff33863971d3b04004ff37 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -69,7 +69,7 @@ void KernelContext::AssignInputRange(std::pair&& range, size_t idx) { } else if (idx == input_range_.size()) { input_range_.emplace_back(range); } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "Invalid idx when trying to set InputRange, " "index is `%d`, it is greater than the size(%d) of InputRange.", idx, @@ -83,7 +83,7 @@ void KernelContext::AssignOutputRange(std::pair&& range, size_t idx) { } else if (idx == output_range_.size()) { output_range_.emplace_back(range); } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "Invalid idx when trying to set InputRange, " "index is `%d`, it is greater than the size(%d) of InputRange.", idx, diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 0b960004fcb2729181b8f8d91b7d4cb041b01ca8..57e2db60c24caea8cbac323d9c47bdb53acc8a8c 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -22,6 +22,7 @@ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/utils/any.h" +#include "paddle/utils/optional.h" #include "paddle/utils/small_vector.h" namespace phi { diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index b31bedd958b4b5bfdf32e80ab81e44dd3307e520..be91409762635e8aabdd6953aa5527d94959e4b2 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -225,8 +225,8 @@ class KernelFactory { KernelNameMap& kernels() { return kernels_; } - bool HasCompatiblePtenKernel(const std::string& op_type) const { - return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end(); + bool HasCompatiblePhiKernel(const std::string& op_type) const { + return kernels_.find(TransToPhiKernelName(op_type)) != kernels_.end(); } const Kernel& SelectKernelOrThrowError(const std::string& kernel_name, diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 577e9e28cf3791880a34114201075447f6d9eaf0..6a1688947b986549e1feaf39cdf6c73749b0ff3a 100644 --- a/paddle/phi/core/kernel_registry.h +++ 
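// A hedged sketch of an InferMeta function with an optional input, relying on
// the OptionalInputAt accessor and the paddle::optional call helper added
// above. The op and argument names are hypothetical.
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/utils/optional.h"

namespace phi {
void MyAddBiasInferMeta(const MetaTensor& x,
                        paddle::optional<const MetaTensor&> bias,
                        MetaTensor* out) {
  out->set_dims(x.dims());
  out->set_dtype(x.dtype());
  out->set_layout(x.layout());
  if (bias) {
    // bias->dims() could be validated against x.dims() here.
  }
}
}  // namespace phi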
b/paddle/phi/core/kernel_registry.h @@ -21,6 +21,7 @@ #include #include +#include "paddle/phi/core/custom_kernel.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/macros.h" @@ -62,6 +63,9 @@ struct KernelArgsParseFunctor { #elif defined(PADDLE_WITH_XPU) || arg_type == std::type_index(typeid(const XPUContext&))) { +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + || + arg_type == std::type_index(typeid(const CustomContext&))) { #else ) { #endif @@ -83,11 +87,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -99,11 +105,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe @@ -121,20 +129,28 @@ struct KernelArgsParseFunctor { } }; +// NOTE: used for making a difference between inner or outer registration. +enum class RegType : uint8_t { + INNER = 0, + OUTER, +}; + // TODO(chenweihang): Polish the kernel selection logic, support the selection // of ALL_DTYPE kernel, and simplify the constructor struct KernelRegistrar { public: - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, KernelFn kernel_fn, void* variadic_kernel_fn) { - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, dtype, args_parse_fn, @@ -143,8 +159,9 @@ struct KernelRegistrar { variadic_kernel_fn); } - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, @@ -160,8 +177,9 @@ struct KernelRegistrar { dtype == static_cast(DataType::UINT16)) { continue; } - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, static_cast(dtype), args_parse_fn, @@ -172,8 +190,9 @@ struct KernelRegistrar { } private: - void ConstructKernel(const char* kernel_name_cstr, - Backend backend, + void ConstructKernel(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, @@ -181,11 +200,16 @@ struct KernelRegistrar { KernelFn kernel_fn, void* variadic_kernel_fn) { std::string kernel_name(kernel_name_cstr); - KernelKey kernel_key(backend, layout, dtype); + KernelKey kernel_key( + paddle::experimental::StringToBackend(backend_cstr), layout, dtype); Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(kernel_key, &kernel); - KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + if (reg_type == 
RegType::INNER) { + KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + } else { + CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel; + } } }; @@ -210,7 +234,7 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -/** PT_REGISTER_KERNEL +/** PD_REGISTER_KERNEL * * The most frequently used kernel registration macro, used for kernel * registration with only data type as template parameter, and the function @@ -219,22 +243,39 @@ struct KernelRegistrar { * * Note: `2TA` means `2 template argument` */ -#define PT_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_EXPAND(_PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, __VA_ARGS__)) +#define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::INNER, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +#define _PD_REGISTER_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_REGISTER_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_KERNEL must be called in global namespace."); \ + PT_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) #ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ +#define _PD_REGISTER_2TA_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -254,13 +295,15 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ +#define _PD_REGISTER_2TA_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_EXPAND(PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -269,82 +312,119 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) \ - _PT_KERNEL_INSTANTIATION( \ - PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, ...) 
\ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, ...) 
\ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \ + _PT_KERNEL_INSTANTIATION( \ + PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, cpp_dtype) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) + +#define PT_KERNEL_REGISTRAR_INIT(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + ...) \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ + reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) // clang-format off @@ -352,15 +432,19 @@ struct KernelRegistrar { /* The =pre-commit always treats this macro into the wrong format, and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ args_def_fn, \ meta_kernel_fn, \ ...) \ PT_EXPAND(PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ PT_ID, \ args_def_fn, \ @@ -369,413 +453,492 @@ struct KernelRegistrar { // clang-format on -#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ +#define _PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_15(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) - -/** PT_REGISTER_GENERAL_KERNEL +/** PD_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function * with one template argument. */ -#define PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define PD_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + _PD_REGISTER_GENERAL_KERNEL( \ + ::phi::RegType::INNER, kernel_name, backend, layout, kernel_fn, dtype) + +#define _PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - _PT_REGISTER_GENERAL_KERNEL(kernel_name, backend, layout, kernel_fn, dtype) + PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ + __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) #ifndef _WIN32 -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -787,14 +950,15 @@ struct KernelRegistrar { void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const 
::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -807,18 +971,48 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -/** PT_DECLARE_KERNEL +/** PD_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, * to avoid being removed by linker */ -#define PT_DECLARE_KERNEL(kernel_name, backend, layout) \ +#define PD_DECLARE_KERNEL(kernel_name, backend, layout) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_DECLARE_KERNEL must be called in global namespace."); \ + PD_DECLARE_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ extern int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout(); \ UNUSED static int \ __declare_kernel_symbol_for_##kernel_name##_##backend##_##layout = \ TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() +/** PD_REGISTER_BUILTIN_KERNEL + * + * Used to register kernels for built-in backends. + * Supports CPU, GPU and XPU. + */ +#define PD_REGISTER_BUILTIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::OUTER, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +/** PD_REGISTER_PLUGIN_KERNEL + * + * Used to register kernels for plug-in backends. + * Supports user-defined backends such as 'Ascend910'. + */ +#define PD_REGISTER_PLUGIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) 
\ + _PD_REGISTER_KERNEL(::phi::RegType::OUTER, \ + kernel_name, \ + backend, \ + ::phi::CustomContext, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + } // namespace phi diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 8c7d096eab0916d984819cfe85810a90cd29e631..2fda3cb6db4fdb4aaac7fc7c88075b833c050bad 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/scalar.h" @@ -22,7 +23,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" +#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -210,13 +213,18 @@ struct KernelImpl { #ifdef PADDLE_WITH_XPU PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); +#endif /* Input Helpers */ PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -237,6 +245,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); @@ -250,7 +259,9 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/lod_utils.h b/paddle/phi/core/lod_utils.h index 2b0be4d93429d222afbf28d9de0a7bced19a498b..147fca4cb576ce1625df83cca95d3701e082e6f6 100644 --- a/paddle/phi/core/lod_utils.h +++ b/paddle/phi/core/lod_utils.h @@ -13,12 +13,11 @@ // limitations under the License. #pragma once - -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/mixed_vector.h" +#include +#include namespace phi { -using LoD = std::vector>; +using LoD = std::vector>; void AppendLoD(LoD* lod, const LoD& lod_length); @@ -34,4 +33,4 @@ void AppendLoD(LoD* lod, const LoD& lod_length); */ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); -} // namespace pten +} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index cd48777b8ea61d58991923ea5919d7555d0a219b..7ee475b4d5d9e03d0931587f2a607f5f4950a426 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -55,25 +55,17 @@ class SelectedRows : public TensorBase, void set_height(int64_t height) { impl_->set_height(height); } - const paddle::framework::Vector& rows() const { - return impl_->rows(); - } + const std::vector& rows() const { return impl_->rows(); } - paddle::framework::Vector* mutable_rows() { - return impl_->mutable_rows(); - } - - void set_rows(const paddle::framework::Vector& rows) { - impl_->set_rows(rows); - } + std::vector* mutable_rows() { return impl_->mutable_rows(); } + void set_rows(const std::vector& rows) { impl_->set_rows(rows); } /* * @brief Get the index of key in rows * * @return -1 if the key does not exists. */ int64_t Index(int64_t key) const { return impl_->Index(key); } - /* * @brief whether has the specified key in the table. * diff --git a/paddle/phi/core/selected_rows_impl.cc b/paddle/phi/core/selected_rows_impl.cc index 920e9935d5899de82eb2cdd81616f8466916d7e3..7e5fd51343a09aa4ae974ad30f3265169489862c 100644 --- a/paddle/phi/core/selected_rows_impl.cc +++ b/paddle/phi/core/selected_rows_impl.cc @@ -28,7 +28,7 @@ struct ReAllocateVisitor { template void operator()() const { phi::DenseTensor cpu_tensor; - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; T* ptr = cpu_tensor.mutable_data(dims_, cpu); const T* old_ptr = tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); @@ -57,7 +57,7 @@ struct TensorCopyVisitor { template void apply() const { // TODO(Yancey1989): support other place - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; paddle::memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, @@ -82,7 +82,7 @@ struct TensorFillVisitor { template void apply() const { // TODO(qiao): support other place - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; auto* tensor_data = dst_->mutable_data(cpu); auto* start = tensor_data + dst_offset_; auto* end = start + size_; @@ -121,16 +121,16 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key, auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { rwlock_->UNLock(); - PADDLE_ENFORCE_EQ(auto_grown, - true, - paddle::platform::errors::NotFound( - "Input key(%lld) is not found.", key)); + PADDLE_ENFORCE_EQ( + auto_grown, + true, + phi::errors::NotFound("Input key(%lld) is not found.", key)); rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Row map size(%zu) should be equal to rows size(%zu).", map_size, vector_size)); @@ -140,7 +140,7 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key, int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Selected rows is full, then length exceed the length of first " "dimension (%d).", row_num)); @@ -187,7 +187,7 @@ void SelectedRowsImpl::Get(const phi::DenseTensor& ids, PADDLE_ENFORCE_EQ( value_width, value->numel() / value->dims()[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output tensor should have the same shape with table " "except the first dimmension, excepted value width not counting " "the first dimension is %d, actual value width is %d.", diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index 86579e529371ad1289e8c792725b642b3a8e117c..3c54b59a159ddfdac25ad64f083cde97cfdd39f6 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -27,8 +27,6 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/rw_lock.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/mixed_vector.h" namespace phi { class SelectedRowsImpl { /* @@ -68,13 +66,11 @@ class SelectedRowsImpl { void set_height(int64_t height) { height_ = height; } - const paddle::framework::Vector& rows() const { return rows_; } + const std::vector& rows() const { return rows_; } - paddle::framework::Vector* mutable_rows() { return &rows_; } + std::vector* mutable_rows() { return &rows_; } - void set_rows(const paddle::framework::Vector& rows) { - rows_ = rows; - } + void set_rows(const std::vector& rows) { rows_ = rows; } /* * @brief Get the index of key in rows @@ -84,7 +80,7 @@ class SelectedRowsImpl { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW(paddle::platform::errors::NotFound( + PADDLE_THROW(phi::errors::NotFound( "Input id (%lld) is not in current rows table.", key)); } return static_cast(std::distance(rows_.begin(), it)); @@ -156,10 +152,7 @@ class SelectedRowsImpl { /// \brief Returns the dims of the tensor. /// \return The dims of the tensor. 
- const DDim& dims() const noexcept { - return value_->dims(); - // return phi::make_ddim(dims); - } + const DDim& dims() const noexcept { return value_->dims(); } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. @@ -185,7 +178,7 @@ class SelectedRowsImpl { // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. // SelectedRowsImpl are simply concated when adding together. Until a // SelectedRowsImpl add a Tensor, will the duplicate rows be handled. - paddle::framework::Vector rows_; + std::vector rows_; std::unordered_map id_to_index_; // should not be used when rows_ has duplicate member std::unique_ptr value_{nullptr}; diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index 1659f09248be02a74243a2de071606a9a8d5667c..ceaebe4e35b7120af160e27fca4347add941d458 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -69,17 +69,17 @@ void SparseCooTensor::Resize(const DDim& dense_dims, const int64_t non_zero_num) { PADDLE_ENFORCE_GE(non_zero_num, this->nnz(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the non_zero_num must be greater than or equal to the " "origin non_zero_num.")); PADDLE_ENFORCE_GE(sparse_dim, 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the sparse_dim must be greater than or equal 1.")); PADDLE_ENFORCE_LE( sparse_dim, dense_dims.size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the sparse_dim must be less than or equal dense_dims.")); DDim indices_dims = phi::make_ddim({sparse_dim, non_zero_num}); @@ -106,7 +106,7 @@ void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices, const bool coalesced) { this->non_zero_indices_ = non_zero_indices; this->non_zero_elements_ = non_zero_elements; - this->dims_ = dims_; + this->dims_ = dims; this->coalesced_ = coalesced; } diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index 7f7cd76378cc4932063ecd105147f0bc1a9d07b7..cbf5f941b665d8ae2be58472069d2e04891afe29 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -20,7 +20,7 @@ inline void check_shape(const DDim& dims) { bool valid = dims.size() == 2 || dims.size() == 3; PADDLE_ENFORCE(valid, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the SparseCsrTensor only support 2-D Tensor.")); } #define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \ @@ -29,12 +29,12 @@ inline void check_shape(const DDim& dims) { PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_crows.place(), \ - paddle::platform::errors::InvalidArgument( \ + phi::errors::InvalidArgument( \ "non_zero_crows and non_zero_cols must have the same place.")); \ PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_elements.place(), \ - paddle::platform::errors::InvalidArgument( \ + phi::errors::InvalidArgument( \ "non_zero_cols and non_zero_elements must have the same place.")); \ } @@ -77,7 +77,7 @@ void* SparseCsrTensor::AllocateFrom(Allocator* allocator, void SparseCsrTensor::Resize(const DDim& dense_dims, const int64_t non_zero_num) { PADDLE_ENFORCE(this->initialized(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the SparseCsrTensor must be initialized when call Resize " "function.")); check_shape(dense_dims); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 
d5e5e2aa001fd4358bf35179316ddd7519840d05..3d2da542c74176017492bdb9f567396f81308d6a 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -20,16 +20,20 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" +#include "paddle/utils/any.h" +#include "paddle/utils/optional.h" // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? // @zhanlve: Rollback to original LoD for now +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/fluid/framework/mixed_vector.h" +#endif namespace phi { using DDim = phi::DDim; -using LoD = std::vector>; +using LoD = std::vector>; /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. /// diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 04db7c0877ad81f2aa54241871fe7dca79380946..676a590ecbce23a107bcc891c37ac69406854035 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -31,25 +31,25 @@ class DenseTensorUtils { size_t bytes = tensor.numel() * SizeOf(tensor.dtype()); PADDLE_ENFORCE_GE(tensor.capacity(), bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The memory size %d should be enough to meet the " "volume required by metadata %d.", tensor.capacity(), bytes)); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - tensor.dims()[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); + PADDLE_ENFORCE_GE( + begin_idx, + 0, + phi::errors::OutOfRange("The start row index must be greater than or equal to 0. " + "But received the start index is %d.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, + tensor.dims()[0], + phi::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index efb01d6664238f2dacf6a7860c41fd6ce58757f6..a190b222f86ac4145f7ad02eab043a03038c1096 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -23,39 +23,39 @@ limitations under the License. 
*/ namespace phi { -#define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ +#define _PhiForEachDataTypeHelper_(callback, cpp_type, data_type) \ callback(cpp_type, data_type); -#define _PtenForEachDataType_(callback) \ - _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ - _PtenForEachDataTypeHelper_( \ +#define _PhiForEachDataType_(callback) \ + _PhiForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::float16, DataType::FLOAT16); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::bfloat16, DataType::BFLOAT16); \ - _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ - _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ - _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ - _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ - _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ - _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ - _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PhiForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PhiForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PhiForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PhiForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX64); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX128); template inline void VisitDataType(phi::DataType type, Visitor visitor) { -#define PtenVisitDataTypeCallback(cpp_type, data_type) \ - do { \ - if (type == data_type) { \ - visitor.template apply(); \ - return; \ - } \ +#define PhiVisitDataTypeCallback(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply(); \ + return; \ + } \ } while (0) - _PtenForEachDataType_(PtenVisitDataTypeCallback); -#undef PtenVisitDataTypeCallback + _PhiForEachDataType_(PhiVisitDataTypeCallback); +#undef PhiVisitDataTypeCallback PADDLE_THROW(phi::errors::Unimplemented( "Not supported phi::DataType(%d) as data type.", static_cast(type))); } diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt index c077e7b4c55636e07eaf9353d009e857c239b8ec..f7102629d213c08ecb3da1dfdd974e3354105e61 100644 --- a/paddle/phi/infermeta/CMakeLists.txt +++ b/paddle/phi/infermeta/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index c4ae2e0b371c1336aeac69ec7eda208ce35e09d4..7d403fee94300e9517fcc517f4d088470d772e35 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -16,6 +16,54 @@ limitations under the License. 
*/ namespace phi { +void BilinearTensorProductGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto weight_dims = weight.dims(); + auto out_dims = dout.dims(); + + PADDLE_ENFORCE_EQ( + out_dims.size(), + 2UL, + errors::InvalidArgument("The input(Out@GRAD) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + x_dims[0], + out_dims[0], + errors::InvalidArgument( + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X).")); + PADDLE_ENFORCE_EQ( + weight_dims[0], + out_dims[1], + errors::InvalidArgument( + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight).")); + + if (dx) { + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + } + if (dy) { + dy->set_dims(y_dims); + dy->set_dtype(y.dtype()); + } + if (dweight) { + dweight->set_dims(weight_dims); + dweight->set_dtype(weight.dtype()); + } + if (dbias) { + dbias->set_dims({1, out_dims[1]}); + dbias->set_dtype(dout.dtype()); + } +} + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -28,4 +76,33 @@ void GeneralBinaryGradInferMeta(const MetaTensor& x, } } +void GeneralTernaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz) { + if (dx) { + dx->share_meta(x); + } + if (dy) { + dy->share_meta(y); + } + if (dz) { + dz->share_meta(z); + } +} + +void GumbelSoftmaxGradInferMeta(const MetaTensor& out, + const MetaTensor& dout, + int axis, + MetaTensor* dx) { + PADDLE_ENFORCE_EQ( + out.dims(), + dout.dims(), + errors::InvalidArgument( + "Input(Out) and its gradients should have the same shape.")); + dx->share_meta(dout); +} + } // namespace phi diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 965c380db25ecc55c5cac072d593003e0dbe3334..c7090ed664b286e5a8d2c8e327f3c1ea37a71f04 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -20,9 +20,29 @@ limitations under the License. */ namespace phi { +void BilinearTensorProductGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, MetaTensor* dy); +void GeneralTernaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz); + +void GumbelSoftmaxGradInferMeta(const MetaTensor& out, + const MetaTensor& dout, + int axis, + MetaTensor* dx); } // namespace phi diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f79b5982f6194c8fe52b32320014add744942623..dfaabf7cae21ec9b91624211ce9b852148dd7cc2 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
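For context on the grad InferMeta helpers added to paddle/phi/infermeta/backward.h above, a hedged sketch (hypothetical operator, not part of this diff): an op with three forward inputs can delegate its grad meta inference to GeneralTernaryGradInferMeta, which copies each input's meta onto the corresponding grad output when that output is requested:

    #include "paddle/phi/infermeta/backward.h"

    // Hypothetical ternary op: dx/dy/dz simply share x/y/z's dims, dtype and lod.
    void MyOpGradInferMeta(const phi::MetaTensor& x,
                           const phi::MetaTensor& y,
                           const phi::MetaTensor& z,
                           phi::MetaTensor* dx,
                           phi::MetaTensor* dy,
                           phi::MetaTensor* dz) {
      phi::GeneralTernaryGradInferMeta(x, y, z, dx, dy, dz);
    }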
*/ #include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -22,7 +23,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The dimensions of input tensor X (%s) " "should be 1 or 2", x_dims.to_str())); @@ -31,7 +32,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { PADDLE_ENFORCE_EQ( true, x_rank == static_cast(y_dims.size()), - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The shape of input tensor Y: %s should match with " "input tenosr X: %s", y_dims.to_str(), @@ -46,7 +47,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { PADDLE_ENFORCE_EQ(true, shape_match, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The shape of input tensor X: %s should " "be exactly the same " "with input tensor Y: %s", @@ -70,12 +71,12 @@ void MatmulInferMeta(const MetaTensor& x, auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT(ndims_x, 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(x) dims size must be greater than 0," " but reviced dims size is 0. ")); PADDLE_ENFORCE_GT(ndims_y, 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(y) dims size must be greater than 0," " but reviced dims size is 0. ")); @@ -149,7 +150,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, if (x_dims.size() == y_dims.size()) { PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "axis should be -1 or 0 while the dimension of " "tensor X (%s) is equal to the dimension of " "tensor Y (%s), but received axis: %s", @@ -159,7 +160,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, } PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis range must be [%s, %s), but axis is %s. 
" "Please set the axis again.", -1 * max_dim, @@ -188,4 +189,129 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto dim = axis; + + bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + x_dim, + y_dim)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < x_dim.size() && dim >= (0 - x_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + x_dim.size(), + x_dim.size() - 1, + dim)); + if (dim < 0) { + dim += x_dim.size(); + } + PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims()[dim] should be equal to 3." + "But received Input(X/Y).dims()[dim] = %d.", + x_dim[dim])); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto in_dims = x.dims(); + out->set_dims(in_dims); +} + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + int rank = input_dims.size(); + PADDLE_ENFORCE_EQ(rank, + label_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." + "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + label_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. 
But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + input_dims, + label_dims)); + } + + out->set_dims(input_dims); + out->set_dtype(input.dtype()); + out->share_lod(input); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 5e3214127ee2361117a215ad7623b040599519df..02750482dccaabd53f360fcc361bfdc8e788b89e 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -45,4 +45,22 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, const MetaTensor& y_meta, int axis, MetaTensor* out); + +void HuberLossInferMeta(const MetaTensor& input_meta, + const MetaTensor& label_meta, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config = MetaConfig()); + +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config = MetaConfig()); } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 5e7dd1de69d7d0f3de5ef7e67dc8d1f48373abdb..7a0db3d5c17ee3cd40891601009a3841f603bb32 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -18,13 +18,79 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +void BilinearTensorProductInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + paddle::optional bias, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto weight_dims = weight.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size(), + 2UL, + errors::InvalidArgument("The input(X) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + y_dims.size(), + 2UL, + errors::InvalidArgument("The input(Y) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + weight_dims.size(), + 3UL, + errors::InvalidArgument( + "Expected the input(Weight) is a 3D tensor. 
But received %dD tensor.", + weight_dims.size())); + if (config.is_runtime || (x_dims[0] > 0 && y_dims[0] > 0)) { + PADDLE_ENFORCE_EQ(x_dims[0], + y_dims[0], + errors::InvalidArgument( + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y).")); + } + PADDLE_ENFORCE_EQ(x_dims[1], + weight_dims[1], + errors::InvalidArgument( + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight).")); + PADDLE_ENFORCE_EQ(y_dims[1], + weight_dims[2], + errors::InvalidArgument( + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight).")); + + if (bias.get_ptr()) { + auto bias_dims = bias->dims(); + PADDLE_ENFORCE_EQ(bias_dims.size(), + 2UL, + errors::InvalidArgument( + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector).")); + PADDLE_ENFORCE_EQ(bias_dims[0], + 1UL, + errors::InvalidArgument( + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector).")); + PADDLE_ENFORCE_EQ(bias_dims[1], + weight_dims[0], + errors::InvalidArgument( + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight).")); + } + + out->set_dims({x_dims[0], weight_dims[0]}); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config) { PADDLE_ENFORCE_GE(x.size(), 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); @@ -34,7 +100,7 @@ void ConcatInferMeta(const std::vector& x, PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 589fc33333d0c3daf8291f75801b6484d8ddf053..a5fb2a4cbddc33b97b31a26fa29293868808875a 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,13 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +void BilinearTensorProductInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + paddle::optional bias, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc new file mode 100644 index 0000000000000000000000000000000000000000..52aeaef8438548542e2ecac4219f6eb2a8e8462b --- /dev/null +++ b/paddle/phi/infermeta/ternary.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +void AddmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float alpha, + float beta, + MetaTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto ndim_input = input_dims.size(); + auto ndim_x = x_dims.size(); + auto ndim_y = y_dims.size(); + + VLOG(3) << "addmm operator input.shape=" << input_dims + << " x.shape=" << x_dims << " y.shape=" << y_dims << " beta=" << beta + << " alpha=" << alpha << " ndim_input=" << ndim_input + << " ndim_x=" << ndim_x << " ndim_y=" << ndim_y; + + PADDLE_ENFORCE_NE( + product(input_dims), + 0, + errors::PreconditionNotMet("The Input variable 'input' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(x_dims), + 0, + errors::PreconditionNotMet("The Input variable 'x' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(y_dims), + 0, + errors::PreconditionNotMet("The Input variable 'y' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + // dim check + PADDLE_ENFORCE_EQ( + ndim_input, + 2, + errors::InvalidArgument("The input tensor input's dimension must be 2. " + "But received input's dimension = [%s].", + ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_x, + 2, + errors::InvalidArgument("The input tensor x's dimension must be 2. " + "But received x's dimension = [%s].", + ndim_x)); + PADDLE_ENFORCE_EQ( + ndim_y, + 2, + errors::InvalidArgument("The input tensor y's dimension must be 2. " + "But received y's dimension = [%s].", + ndim_y)); + + std::vector output_dims; + output_dims.push_back(x_dims[0]); + output_dims.push_back(y_dims[1]); + + out->set_dims(make_ddim(output_dims)); + out->share_lod(input); + out->set_dtype(input.dtype()); +} + +} // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h new file mode 100644 index 0000000000000000000000000000000000000000..d6223dd87aaf8e8c20c00ad72523e160ee15faee --- /dev/null +++ b/paddle/phi/infermeta/ternary.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" + +namespace phi { + +// Common InferMeta Functions for ternary operators, The format like: +// +// 1. void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// ..., +// MetaTensor* out) {} +// +// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. +// Because functions in this file not only can infer shape, but also need +// infer lod or other useful data. 
+ +void AddmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float alpha, + float beta, + MetaTensor* out); + +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4b13545e038f0970c5ed60ca3c4fefaeb6edba58..49fd0a343a470f2545fc563366256f4f92294297 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -14,10 +14,12 @@ limitations under the License. */ #include "paddle/phi/infermeta/unary.h" +#include #include #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { @@ -25,6 +27,30 @@ void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { out->share_meta(x); } +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out) { + auto rank = x.dims().size(); + PADDLE_ENFORCE_GE( + axis, + -rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + PADDLE_ENFORCE_LT( + axis, + rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + out->share_meta(x); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, @@ -37,11 +63,11 @@ void FlattenInferMeta(const MetaTensor& x, if (stop_axis < 0) { stop_axis = stop_axis + in_dims_size; } - PADDLE_ENFORCE_GE(stop_axis, - start_axis, - paddle::platform::errors::InvalidArgument( - "The stop_axis should be greater" - "than or equal to start_axis.")); + PADDLE_ENFORCE_GE( + stop_axis, + start_axis, + phi::errors::InvalidArgument("The stop_axis should be greater" + "than or equal to start_axis.")); int64_t outer = 1; std::vector out_shape; @@ -73,12 +99,42 @@ void FlattenInferMeta(const MetaTensor& x, } } +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out) { + UnchangedInferMetaCheckAxis(x, axis, out); +} + void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(out_dtype); out->set_layout(x.layout()); } +void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { + auto dims = x.dims(); + auto rank = dims.size(); + PADDLE_ENFORCE_GE(rank, + 2, + errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. But " + "received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + dims[rank - 2], + dims[rank - 1], + errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should be symmetric " + "positive-definite matrices and have the same size. But received " + "X's shape[-2] = %d and shape[-1] = %d.", + dims[rank - 2], + dims[rank - 1])); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); +} + void CopyToInferMeta(const MetaTensor& x, Backend backend, bool blocking, @@ -92,6 +148,18 @@ void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_layout(x.layout()); } +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { + PADDLE_ENFORCE_EQ( + product(x.dims()), + 1UL, + errors::InvalidArgument("The number of elements in Input(X) should be 1." 
+ "Now the number is %d.", + product(x.dims()))); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + static phi::DDim ValidateShape(const std::vector shape, const phi::DDim& in_dims) { const int64_t in_size = phi::product(in_dims); @@ -112,7 +180,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( unk_dim_idx, -1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", phi::make_ddim(shape), @@ -122,7 +190,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " @@ -135,7 +203,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_GT( shape[i], 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", @@ -160,7 +228,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( output_shape[unk_dim_idx] * capacity, -in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' attribute in ReshapeOp is invalid. " "The input tensor X'size must be divisible by known " "capacity of 'shape'. " @@ -178,7 +246,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( capacity, in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X'size must be equal to the capacity of " "'shape'. " @@ -198,7 +266,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_LE( capacity, in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X's shape = [%s], X's capacity = %d." 
"But the target shape of Out is [%s], the " @@ -232,6 +300,41 @@ void InferMetaFromVecValue(const MetaTensor& x, } } +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out) { + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + PADDLE_ENFORCE_GT(x_rank, + 0, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be > 0, but got %d.", + x_rank)); + PADDLE_ENFORCE_LE(x_rank, + 2, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be <= 2, but got %d.", + x_rank)); + + std::vector out_dims(x_rank); + for (int64_t i = 0; i < x_rank - 1; i++) { + out_dims[i] = x_dim[i]; + } + + PADDLE_ENFORCE_GT( + num_samples, + 0, + errors::InvalidArgument( + "The number of samples should be > 0, but got %d.", num_samples)); + out_dims[x_rank - 1] = num_samples; + + out->set_dims(make_ddim(out_dims)); + out->set_dtype(DataType::INT64); +} + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, @@ -363,7 +466,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( axis_value >= -rank && axis_value < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -382,7 +485,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ(input_axis_dim % num, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input's size along the split dimension " "must be evenly divisible by Attr(num_or_sections). " "But received Attr(num_or_sections) " @@ -415,7 +518,7 @@ void SplitInferMeta(const MetaTensor& x, if (config.is_runtime) { PADDLE_ENFORCE_LE(num_of_unknow, 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of Attr(num_or_sections) " "in SplitOp can be -1. " "But received Attr(num_or_sections) = [%s].", @@ -429,7 +532,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_LT( sum_of_section, input_axis_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) other than unknown section " "must be less than the input's " "size " @@ -446,7 +549,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( sum_of_section, input_axis_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) must be equal to the input's " "size " "along the split dimension. But received Attr(num_or_sections)" @@ -484,6 +587,25 @@ void SplitInferMeta(const MetaTensor& x, } } +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs) { + auto in_dims = x.dims(); + std::vector out_dim; + axis = axis < 0 ? 
in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { + if (i != axis) out_dim.push_back(in_dims[i]); + } + auto out_dims = phi::make_ddim(out_dim); + + for (size_t i = 0; i < outs->size(); ++i) { + (*outs)[i].set_dtype(x.dtype()); + (*outs)[i].set_dims(out_dims); + (*outs)[i].set_layout(x.layout()); + (*outs)[i].share_lod(x); + } +} + void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { int dim1 = axis1; @@ -537,7 +659,255 @@ void TraceInferMeta( out->set_dims(phi::make_ddim(sizes)); } +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out, + MetaConfig config) { + auto in_dims = x.dims(); + // Only [N, C, H, W] input supported now + PADDLE_ENFORCE_EQ( + in_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be 4-D tensor of format [N, C, H, W], but get %u", + in_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size() - kernel_sizes.size(), + 2U, + phi::errors::InvalidArgument( + "The dims of X should be larger than that of kernel_sizes " + "by a number of 2, due to the batch size and input channel dim. " + "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + in_dims.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + kernel_sizes.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of kernel_sizes. " + "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + strides.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + paddings.size(), + 2 * strides.size(), + phi::errors::InvalidArgument( + "The dims of paddings should be 2 times of that of strides. " + "But recieved dims(paddings: %u) != 2*dims(strides: %u).", + paddings.size(), + strides.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + dilations.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of dilations. 
" + "But recieved dims(strides: %u) != dims(dilations: %u).", + strides.size(), + dilations.size())); + + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_sizes[0], + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_sizes[1], + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(strides[0], + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + PADDLE_ENFORCE_GT(strides[1], + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + // check dilations + PADDLE_ENFORCE_GT( + dilations[0], + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + PADDLE_ENFORCE_GT( + dilations[1], + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + + std::vector out_dims; + out_dims.push_back(in_dims[0]); + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = phi::funcs::CalcOutputSize(in_dims[2], + kernel_sizes[0], + dilations[0], + paddings[0], + paddings[2], + strides[0]); + int output_width = phi::funcs::CalcOutputSize(in_dims[3], + kernel_sizes[1], + dilations[1], + paddings[1], + paddings[3], + strides[1]); + if (config.is_runtime) { + // only check output height and width in runtime + PADDLE_ENFORCE_GT( + output_height, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + PADDLE_ENFORCE_GT( + output_width, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + } + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + out->set_dims(phi::make_ddim(out_dims)); +} + +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out) { + auto x_dims = x.dims(); + + if (x_dims.size() == 1UL) { + int64_t size_ = x_dims[0] + std::abs(offset); + out->set_dims({size_, size_}); + out->set_dtype(x.dtype()); + } else if (x_dims.size() == 2UL) { + int64_t size_ = 0; + if (offset >= 0) { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + 
size_ = x_dims[1] - offset; + } + } else { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } + } + out->set_dims({size_}); + out->set_dtype(x.dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } +} + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { + out->set_dtype(DataType::INT64); + out->set_dims({1}); +} + +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); + + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[1])); + } else { + PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[3])); + } + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + if (!channel_last) { + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + } else { + output_dims[1] = input_dims[1] * upscale_factor; + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); + } + out->set_dtype(x.dtype()); + out->set_dims(output_dims); +} + } // namespace phi -PT_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); -PT_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); +PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); +PD_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 2ab425d42cd33ec49adf704a54e85e6f1714e19c..4fab1ec68ec1e71af5e55a9852cd68deccc09a7c 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -34,13 +34,26 @@ class MetaConfig; void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, MetaTensor* out); +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out); + void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); +void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); + void CopyToInferMeta(const MetaTensor& x, Backend backend, bool blocking, @@ -48,10 +61,17 @@ void CopyToInferMeta(const MetaTensor& x, void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); + void InferMetaFromVecValue(const 
MetaTensor& x, const std::vector& shape, MetaTensor* out); +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out); + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, @@ -90,7 +110,30 @@ void SplitInferMeta(const MetaTensor& x_meta, std::vector* out, MetaConfig config = MetaConfig()); +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs); void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out); + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out); + +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4f78a6500f434c130558059554a29cd559527a11..ef51d6daf6a0052f39c2cf6253c208412cbb6904 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -3,22 +3,30 @@ set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declaratio file(WRITE ${kernel_declare_file} "// Generated by the paddle/phi/kernels/CMakeLists.txt. DO NOT EDIT!\n\n#pragma once\n\n") file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h\"\n\n") -# pten functors and functions called by kernels +# phi functors and functions called by kernels add_subdirectory(funcs) -# pten depends all pten kernel targets -set_property(GLOBAL PROPERTY PTEN_KERNELS "") +# phi depends all phi kernel targets +set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -# auto build kernel targets by cmake -register_kernels(DEPS ${COMMON_KERNEL_DEPS}) +# NOTE: Some kernels depend on some targets that are not commonly used. +# These targets are not suitable for common dependencies. +# In this case, you need to manually generate them here. 
+set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel) +kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -# pten sparse kernels +# auto parse and build kernel targets by cmake +register_kernels(EXCLUDES ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS}) + +# phi sparse kernels add_subdirectory(sparse) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/phi/kernels/addmm_grad_kernel.h b/paddle/phi/kernels/addmm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0d2f445a61de0cb186bdd7fbe7a8a7c0bce2869e --- /dev/null +++ b/paddle/phi/kernels/addmm_grad_kernel.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AddmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/phi/kernels/addmm_kernel.h similarity index 55% rename from paddle/fluid/operators/diag_v2_op.h rename to paddle/phi/kernels/addmm_kernel.h index f0bf04badab79db3ff6c72ea47f4b212832c041f..3674305796cde35f164289f5f405fee4c30e1216 100644 --- a/paddle/fluid/operators/diag_v2_op.h +++ b/paddle/phi/kernels/addmm_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,21 +14,17 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/core/dense_tensor.h" -namespace paddle { -namespace operators { +namespace phi { -using DDim = framework::DDim; +template +void AddmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float alpha, + float beta, + DenseTensor* out); -static inline int ComputeStride(int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/atan2_grad_kernel.h b/paddle/phi/kernels/atan2_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ddd87c9da156d4b9ff983972010b90a74a231c4a --- /dev/null +++ b/paddle/phi/kernels/atan2_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Atan2GradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/atan2_kernel.h b/paddle/phi/kernels/atan2_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..38276fa4f73ce5c0c94383a90e6f6f711efd9bcf --- /dev/null +++ b/paddle/phi/kernels/atan2_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Atan2Kernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/bce_loss_grad_kernel.h b/paddle/phi/kernels/bce_loss_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..14bf52196ac40d81bb925c3fa10c021f173d5218 --- /dev/null +++ b/paddle/phi/kernels/bce_loss_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BCELossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + DenseTensor* input_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/bce_loss_kernel.h b/paddle/phi/kernels/bce_loss_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6459ea911666e7151c2e2fc6645f4a477215f82b --- /dev/null +++ b/paddle/phi/kernels/bce_loss_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BCELossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..499aa1e0b2ea958935c20bc9bbcde89d6a15d9a4 --- /dev/null +++ b/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearTensorProductGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* dweight, + DenseTensor* dbias); + +} // namespace phi diff --git a/paddle/phi/kernels/bilinear_tensor_product_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b34e8946ddd58e0431e52804d0f621d3eb25720c --- /dev/null +++ b/paddle/phi/kernels/bilinear_tensor_product_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void BilinearTensorProductKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + paddle::optional bias, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/fluid/operators/randperm_op.cu b/paddle/phi/kernels/cholesky_grad_kernel.h similarity index 54% rename from paddle/fluid/operators/randperm_op.cu rename to paddle/phi/kernels/cholesky_grad_kernel.h index 7ed52a8fd25b104f50446082ff3a040e90bf44ea..b170a3d7ffcfacdf8186d0f54450a38b536949d5 100644 --- a/paddle/fluid/operators/randperm_op.cu +++ b/paddle/phi/kernels/cholesky_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/randperm_op.h" +#pragma once -template -using kernel = - paddle::operators::RandpermKernel; +#include "paddle/phi/core/dense_tensor.h" -REGISTER_OP_CUDA_KERNEL(randperm, kernel, kernel, kernel, - kernel); +namespace phi { + +template +void CholeskyGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + bool upper, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/cholesky_kernel.h b/paddle/phi/kernels/cholesky_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5dc1473d8dbcad895abefccb7d034d686eed1775 --- /dev/null +++ b/paddle/phi/kernels/cholesky_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/fluid/operators/eye_op.cu b/paddle/phi/kernels/complex_grad_kernel.h similarity index 51% rename from paddle/fluid/operators/eye_op.cu rename to paddle/phi/kernels/complex_grad_kernel.h index 8d55235a54c70b1a4db4bd7f355332c923207591..505d4d374424141ad71da863d1fd7a6424fb35ef 100644 --- a/paddle/fluid/operators/eye_op.cu +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#pragma once -namespace ops = paddle::operators; -namespace plf = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - eye, ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel); +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RealGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx); + +template +void ImagGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index cfe9da23880363ccddc84ec39beb9038170e76cb..44bfae9820aa84cb33784f108ace6aa0ab8b5281 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,4 +50,14 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } +template +void RealKernel(const DeviceContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +template +void ImagKernel(const DeviceContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/copy_kernel.h b/paddle/phi/kernels/copy_kernel.h index a3ba6eabcdd694aa5bfca4a0ee669ccca086e78f..95df29f7e653af4d27fccc009da1fcdaa2264f59 100644 --- a/paddle/phi/kernels/copy_kernel.h +++ b/paddle/phi/kernels/copy_kernel.h @@ -22,6 +22,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst); } // namespace phi diff --git a/paddle/phi/kernels/cpu/abs_grad_kernel.cc b/paddle/phi/kernels/cpu/abs_grad_kernel.cc index 3c90a348d86a4ccdc1f6a5c1cd53815e00e1fa79..ca42a5eb2976f62708544e3d3bdd31f63d2a004f 100644 --- a/paddle/phi/kernels/cpu/abs_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_grad_kernel.cc @@ -19,7 +19,7 @@ using phi::dtype::complex; -PT_REGISTER_KERNEL(abs_grad, +PD_REGISTER_KERNEL(abs_grad, CPU, ALL_LAYOUT, phi::AbsGradKernel, @@ -29,7 +29,7 @@ PT_REGISTER_KERNEL(abs_grad, int64_t, complex, complex) {} -PT_REGISTER_KERNEL(abs_double_grad, +PD_REGISTER_KERNEL(abs_double_grad, CPU, ALL_LAYOUT, phi::AbsDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 97bd89832870cc1d2a9031c266441bfa4c732ef2..efe7d090405df72ce07b2b2bb7f045977d982eff 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. 
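For context on the Copy signature change in paddle/phi/kernels/copy_kernel.h above, a hedged call-site sketch (illustration only; the wrapper below is hypothetical): the new Place parameter makes the destination explicit instead of leaving it implied by the device context:

    #include "paddle/phi/common/place.h"
    #include "paddle/phi/kernels/copy_kernel.h"

    // Hypothetical helper: copy a tensor to host memory, blocking until done.
    template <typename Context>
    void CopyToCpu(const Context& dev_ctx,
                   const phi::DenseTensor& src,
                   phi::DenseTensor* dst) {
      phi::Copy(dev_ctx, src, phi::CPUPlace(), /*blocking=*/true, dst);
    }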
#include "paddle/phi/kernels/abs_kernel.h" -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -29,14 +29,14 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { out, size_t(x.numel() * sizeof(phi::funcs::Real))); auto* out_data = out->data>(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); for_range(functor); } } // namespace phi -PT_REGISTER_KERNEL(abs, +PD_REGISTER_KERNEL(abs, CPU, ALL_LAYOUT, phi::AbsKernel, diff --git a/paddle/phi/kernels/cpu/addmm_grad_kernel.cc b/paddle/phi/kernels/cpu/addmm_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6032f15e0f75e87fc491212361f77d46f98c9ea3 --- /dev/null +++ b/paddle/phi/kernels/cpu/addmm_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + addmm_grad, CPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/addmm_kernel.cc b/paddle/phi/kernels/cpu/addmm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff86b655ed3ef2d195c5d6c6e49883f364bcc2e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/addmm_kernel.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_kernel_impl.h" + +PD_REGISTER_KERNEL(addmm, CPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ff7431f0c8c556770b54e1328251e5996850fc9 --- /dev/null +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(atan2_grad, + CPU, + ALL_LAYOUT, + phi::Atan2GradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb38a6c90b7938ef16cf9d56dfdb93903cc3c6a1 --- /dev/null +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + +PD_REGISTER_KERNEL(atan2, + CPU, + ALL_LAYOUT, + phi::Atan2Kernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6859451e8be32d6d70003d6ce790810d1cc815aa --- /dev/null +++ b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bce_loss_grad_kernel.h" + +#include // for max +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void BCELossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + DenseTensor* input_grad) { + auto dx_data = dev_ctx.template Alloc(input_grad); + auto dout_data = out_grad.data(); + auto x_data = input.data(); + auto label_data = label.data(); + + int x_numel = input.numel(); + + // dx = dout * ((x - label)/(x - x^2)) + for (int i = 0; i < x_numel; ++i) { + dx_data[i] = + dout_data[i] * ((x_data[i] - label_data[i]) / + std::max((static_cast(1) - x_data[i]) * x_data[i], + static_cast(1e-12))); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss_grad, CPU, ALL_LAYOUT, phi::BCELossGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b979365148468c883962f07db1b923e7ef25b8 --- /dev/null +++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bce_loss_kernel.h" + +#include // for max +#include "paddle/fluid/operators/math.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void BCELossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + DenseTensor* out) { + auto x_data = input.data(); + auto label_data = label.data(); + auto out_data = dev_ctx.template Alloc(out); + auto x_numel = input.numel(); + + // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - + // x) - label * ln(x) + for (int64_t i = 0; i < x_numel; ++i) { + PADDLE_ENFORCE_GE( + x_data[i], + static_cast(0), + phi::errors::InvalidArgument( + "Illegal input, input must be greater than or equal to 0")); + PADDLE_ENFORCE_LE( + x_data[i], + static_cast(1), + phi::errors::InvalidArgument( + "Illegal input, input must be less than or equal to 1")); + out_data[i] = + (label_data[i] - static_cast(1)) * + std::max(paddle::operators::real_log(static_cast(1) - x_data[i]), + (T)(-100)) - + label_data[i] * + std::max(paddle::operators::real_log(x_data[i]), (T)(-100)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss, CPU, ALL_LAYOUT, phi::BCELossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc index 4ba965a4e5f1d2beb6a114b64ca5fa211804bbcb..09c07d9ec9dea028bd3b1921056b78bc97c07ec2 100644 --- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc +++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc @@ -51,5 +51,5 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( bernoulli, CPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2268212316af68433a18d9037136e3e0f733e4dc --- /dev/null +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product_grad, + CPU, + ALL_LAYOUT, + phi::BilinearTensorProductGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..25bc5913865a0717024c3bfe24281ab3b110b159 --- /dev/null +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product, + CPU, + ALL_LAYOUT, + phi::BilinearTensorProductKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index 4e95a37270dd43a4f3f45eb3a26b1c0500e0aaf2..c2c207bfaf25e5bea9faed36c85a5755884e5669 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -58,7 +58,7 @@ void CastKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(cast, +PD_REGISTER_KERNEL(cast, CPU, ALL_LAYOUT, phi::CastKernel, diff --git a/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad9d51db4921e263fd959271e053a8324c52bb64 --- /dev/null +++ b/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cholesky_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + cholesky_grad, CPU, ALL_LAYOUT, phi::CholeskyGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cholesky_kernel.cc b/paddle/phi/kernels/cpu/cholesky_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d9b6b52d75d6924e091c733f2a051f9281b83b2 --- /dev/null +++ b/paddle/phi/kernels/cpu/cholesky_kernel.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/cholesky_kernel.h" + +#include "Eigen/Cholesky" +#include "Eigen/Core" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out) { + using EigenMatrix = + Eigen::Matrix; + using InputMatrixMap = Eigen::Map; + using OutputMatrixMap = Eigen::Map; + + auto& dims = x.dims(); + int batch_count = 1; + for (int i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + auto m = dims[dims.size() - 1]; + + const auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + // Cholesky decomposition for each matrix, maybe can use multi threads + for (int i = 0; i < batch_count; i++) { + auto input = InputMatrixMap(x_data + i * m * m, m, m); + auto output = OutputMatrixMap(out_data + i * m * m, m, m); + if (upper) { + Eigen::LLT< + Eigen::Matrix, + Eigen::UpLoType::Upper> + llt_decomposition(input); + PADDLE_ENFORCE_EQ(llt_decomposition.info(), + Eigen::Success, + errors::InvalidArgument( + "Cholesky decomposition was not successful. The " + "%d-th input matrice " + "might not be not be positive definite.", + i)); + output = llt_decomposition.matrixU(); + } else { + Eigen::LLT< + Eigen::Matrix, + Eigen::UpLoType::Lower> + llt_decomposition(input); + PADDLE_ENFORCE_EQ(llt_decomposition.info(), + Eigen::Success, + errors::InvalidArgument( + "Cholesky decomposition was not successful. The " + "%d-th input matrice " + "might not be not be positive definite.", + i)); + output = llt_decomposition.matrixL(); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + cholesky, CPU, ALL_LAYOUT, phi::CholeskyKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c1d50f5bf27d2cf7b3e0078f1bcab13d1b898a8 --- /dev/null +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/complex_grad_kernel.h" +#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(real_grad, + CPU, + ALL_LAYOUT, + phi::RealGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag_grad, + CPU, + ALL_LAYOUT, + phi::ImagGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 3a886c3378524c62c53aae9951de4db17aad9acc..801502e16737d1ef5ffa475916d5e144d2e8d86b 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -21,7 +21,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(conj, +PD_REGISTER_KERNEL(conj, CPU, ALL_LAYOUT, phi::ConjKernel, @@ -31,3 +31,17 @@ PT_REGISTER_KERNEL(conj, double, int, int64_t) {} + +PD_REGISTER_KERNEL(real, + CPU, + ALL_LAYOUT, + phi::RealKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag, + CPU, + ALL_LAYOUT, + phi::ImagKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/concat_and_split.h b/paddle/phi/kernels/cpu/concat_and_split.h deleted file mode 100644 index 88cfc5db8f2e852ee26f2300afb5a93cf06274c1..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/concat_and_split.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -/* - * \brief Concatenate the input tensors along the dimension axis. - * TODO(zcd): maybe it needs to be more detailed. 
- * Examples: - * Input[0] = [[1,2],[3,4]] - * Input[1] = [[5,6]] - * axis = 0 - * - * Output = [[1,2], - * [3,4], - * [5,6]] - */ - -template -void ConcatImpl(const Context& context, - const std::vector& input, - int axis, - DenseTensor* output) { - // TODO(zcd): Add input data validity checking - size_t num = input.size(); - - int64_t rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int64_t out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (size_t i = 0; i < num; ++i) { - int64_t t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - auto cpu_place = context.GetPlace(); - - // computation - auto output_data = output->data(); - int64_t col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int64_t col_len = input_cols[j]; - auto input_data = input[j].data(); - for (int64_t k = 0; k < out_rows; ++k) { - paddle::memory::Copy(cpu_place, - output_data + k * out_cols + col_idx, - cpu_place, - input_data + k * col_len, - sizeof(T) * col_len); - } - col_idx += col_len; - } -} - -/* - * \brief Split the input tensors along the dimension axis into outputs. - * TODO(zcd): maybe it needs to be more detailed. - * Examples: - * Input = [[1,2], - * [3,4], - * [5,6]] - * axis = 0 - * - * Output[0] = [[1,2],[3,4]] - * Output[1] = [[5,6]] - */ -template -void SplitImpl(const Context& context, - const DenseTensor& input, - const std::vector& ref_inputs, - const int axis, - std::vector* outputs) { - // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 - // tensors of shape [0,1,4] - if (input.numel() == 0) { - return; - } - - // TODO(zcd): Add input data validity checking - size_t num = outputs->size(); - - int input_rows = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - input_rows *= dim_0[i]; - } - - int input_cols = 0; - - std::vector output_cols(outputs->size()); - for (size_t i = 0; i < num; ++i) { - int t_cols = ref_inputs[i]->numel() / input_rows; - input_cols += t_cols; - output_cols[i] = t_cols; - } - auto cpu_place = context.GetPlace(); - - // computation - for (int k = 0; k < input_rows; ++k) { - const T* src_ptr = input.data() + k * input_cols; - int col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int col_len = output_cols[j]; - auto* out_tensor = outputs->at(j); - if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->data() + k * col_len; - paddle::memory::Copy(cpu_place, - dst_ptr, - cpu_place, - src_ptr + col_idx, - sizeof(T) * col_len); - } - col_idx += col_len; - } - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 7f4cce379e04d4744f2544788feec28ba0a915e2..18bb8837b105d91e3e13a0a7519b08c9c47202c4 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -22,7 +22,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/lod_utils.h" -#include "paddle/phi/kernels/cpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -54,7 +54,7 @@ void ConcatKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( x[i].lod().size(), lod_size_0, - paddle::platform::errors::Unimplemented( + phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " "Maybe different lod level of input LoDTensors can concat," "it is not supported currently. 
The lod level of %dth input " @@ -104,13 +104,14 @@ void ConcatKernel(const Context& dev_ctx, continue; } } - ConcatImpl(dev_ctx, inputs, axis, out); + phi::funcs::ConcatFunctor functor; + functor(dev_ctx, inputs, axis, out); } } } // namespace phi -PT_REGISTER_KERNEL(concat, +PD_REGISTER_KERNEL(concat, CPU, ALL_LAYOUT, phi::ConcatKernel, diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 8a79a5f6b1941e1fcd24d5a1f05d1094628ca28d..1af071f23ddc520e6733acdbeec3a0652f4e1d8f 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -28,6 +28,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); @@ -56,5 +57,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, CPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..390420008e6ea107573bbc2038c3a82af19b06e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cross_grad_kernel.h" +#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(cross_grad, + CPU, + ALL_LAYOUT, + phi::CrossGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a63f33174eacda551e595affc34343030468f2c5 --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cross_kernel.h" +#include "paddle/phi/kernels/impl/cross_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1e0b8e31e78fd74e6a15722546971a3cb72807a --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out) { + auto* x_data = x.data(); + auto x_dims = x.dims(); + T* out_data = dev_ctx.template Alloc(out); + auto out_dims = out->dims(); + + int64_t i; + if (x_dims.size() == 1) { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + + auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + auto out_stride_1 = phi::funcs::ComputeStride(1, out_dims); + out_data += (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + for (i = 0; i < x_length; i++) { + out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; + } + } else { + auto out_length = out_dims[0]; + const int& x_stride_0 = phi::funcs::ComputeStride(0, x_dims); + const int& x_stride_1 = phi::funcs::ComputeStride(1, x_dims); + + auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + x_data += (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); + for (i = 0; i < out_length; i++) { + out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {} diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 351b2335386a8b60c725c43d80bff8fc5872eb16..c3c290b4fe91ec1ecee6f0026ed5af39288e2618 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -82,7 +82,7 @@ void DiagonalGradKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal_grad, +PD_REGISTER_KERNEL(diagonal_grad, CPU, ALL_LAYOUT, phi::DiagonalGradKernel, diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index 79f09008f3e2e48cce5ec4f431b6541450c3d710..df17b458e1166b49815d405a4e7d97c5384ab4f0 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -79,7 +79,7 @@ void DiagonalKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal, +PD_REGISTER_KERNEL(diagonal, CPU, ALL_LAYOUT, phi::DiagonalKernel, diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc index 5cb86eef498bd325c8beda7c08f5e76b57f417b0..da1b5ae556609c05a91623cf9cac408e190868b9 100644 --- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma_grad, CPU, ALL_LAYOUT, phi::DigammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc index 0013d8ee7740b8a396ebf127698b6be0b53067d0..ee120a29b6061efcadfb88ecce8ba3235d865ca1 100644 --- a/paddle/phi/kernels/cpu/digamma_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma, CPU, ALL_LAYOUT, phi::DigammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc index 729bc9aa3a3acad547269613cbfb66e75ff20ead..a2abdb7c00900ecd103562430d1f965cbaf92d4e 100644 --- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc @@ -20,7 +20,7 @@ #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(dot_grad, +PD_REGISTER_KERNEL(dot_grad, CPU, ALL_LAYOUT, phi::DotGradKernel, diff --git a/paddle/phi/kernels/cpu/dot_kernel.cc b/paddle/phi/kernels/cpu/dot_kernel.cc index f4f5d1ffeb544dfa006444ce746e076c1d6258ae..3518501a6b63d160d32ecefc57236d4e2aa7b1fa 100644 --- a/paddle/phi/kernels/cpu/dot_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_kernel.cc @@ -49,7 +49,7 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(dot, +PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, phi::DotKernel, diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index c692038d24a0a885d21b9c632709b143681a438d..28bf5ab743f6d5d0608fe65c00d5a0de2af3415b 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -127,7 +127,7 
@@ struct SameDimsDivideFunctor< const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If use SameDimsDivideFunctor, template args(T) must be floating " "point. "); } @@ -278,12 +278,10 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, std::vector index_array(max_dim, 0); const T* x_data = x.data(); const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + x_data, phi::errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); OutType* out_data = ctx.Alloc(z); const int out_size = std::accumulate( @@ -317,12 +315,12 @@ void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -385,12 +383,12 @@ void ElementwiseCompute(const CPUContext& dev_ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -630,12 +628,12 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 2d1b2a3bd7c3fa4d40d6544a704ef984d7fac1fc..c878e8133ffc0dc0c5e4992b315af48bc6cdaf03 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -125,7 +125,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(add_grad, +PD_REGISTER_KERNEL(add_grad, CPU, ALL_LAYOUT, phi::AddGradKernel, @@ -137,7 +137,7 @@ PT_REGISTER_KERNEL(add_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_double_grad, +PD_REGISTER_KERNEL(add_double_grad, CPU, ALL_LAYOUT, phi::AddDoubleGradKernel, @@ -149,7 +149,7 @@ PT_REGISTER_KERNEL(add_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_triple_grad, +PD_REGISTER_KERNEL(add_triple_grad, CPU, ALL_LAYOUT, phi::AddTripleGradKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(add_triple_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_grad, +PD_REGISTER_KERNEL(subtract_grad, CPU, ALL_LAYOUT, phi::SubtractGradKernel, @@ -170,10 +170,11 @@ PT_REGISTER_KERNEL(subtract_grad, int16_t, int, int64_t, + phi::dtype::bfloat16, 
phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_double_grad, +PD_REGISTER_KERNEL(subtract_double_grad, CPU, ALL_LAYOUT, phi::SubtractDoubleGradKernel, @@ -182,5 +183,6 @@ PT_REGISTER_KERNEL(subtract_double_grad, int16_t, int, int64_t, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1fe4f026ab07ef2370c2c69ac10a3a9c831c6a3 --- /dev/null +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/erfinv_grad_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + erfinv_grad, CPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f3a740f9d9be3e68c5e7d3a13933d6b09cdbc75 --- /dev/null +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/erfinv_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc index 427b6441b2d24c8ea1862cb7ae0168a3009c54dc..4799a6aa7afdf85a759d5940edea05e885b965e3 100644 --- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" -PT_REGISTER_KERNEL(expand_grad, +PD_REGISTER_KERNEL(expand_grad, CPU, ALL_LAYOUT, phi::ExpandGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc index cce367c8eb832469a223c4c54d462b6f7c9b4237..077048976729fddefe8162f8eebb4961843dd2e0 100644 --- a/paddle/phi/kernels/cpu/expand_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_kernel_impl.h" -PT_REGISTER_KERNEL(expand, +PD_REGISTER_KERNEL(expand, CPU, ALL_LAYOUT, phi::ExpandKernel, diff --git a/paddle/phi/kernels/cpu/eye_kernel.cc b/paddle/phi/kernels/cpu/eye_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0d0f2c43909690078ff268356242b557dd6e6aa --- /dev/null +++ b/paddle/phi/kernels/cpu/eye_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/eye_kernel.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(eye, + CPU, + ALL_LAYOUT, + phi::EyeKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/flip_kernel.cc b/paddle/phi/kernels/cpu/flip_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa1625d65bdc9b2604ba405744fe3def7a2e7282 --- /dev/null +++ b/paddle/phi/kernels/cpu/flip_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/flip_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +constexpr size_t dim_bitset_size = 64; + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + auto x_dims = x.dims(); + const int total_dims = x_dims.size(); + std::bitset dim_bitset; + for (size_t i = 0; i < axis.size(); ++i) { + int dim = axis[i]; + if (axis[i] < 0) { + dim += total_dims; + } + dim_bitset[dim] = true; + } + auto x_strides = phi::stride(x_dims); + auto numel = x.numel(); + const T* x_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel; ++i) { + int64_t cur_indices = i; + int64_t rem = 0; + int64_t dst_offset = 0; + + for (int d = 0; d < total_dims; ++d) { + int64_t temp = cur_indices; + cur_indices = cur_indices / x_strides[d]; + rem = temp - cur_indices * x_strides[d]; + dst_offset += dim_bitset[d] ? (x_dims[d] - 1 - cur_indices) * x_strides[d] + : cur_indices * x_strides[d]; + cur_indices = rem; + } + out_data[i] = x_data[dst_offset]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(flip, + CPU, + ALL_LAYOUT, + phi::FlipKernel, + float, + double, + int32_t, + int64_t, + bool, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index b55eb109f7de32ced5c8a316edd6aa2811b7e77d..86576a861aa4834a4b39b50594565a2d4b3ac510 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -35,7 +35,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValue(dev_ctx, out, val.to()); } @@ -73,7 +73,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, CPU, ALL_LAYOUT, phi::FullKernel, @@ -89,7 +89,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, CPU, ALL_LAYOUT, phi::FullLikeKernel, @@ -99,4 +99,6 @@ PT_REGISTER_KERNEL(full_like, int, int64_t, bool, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4c131e72b59a9b6a975dbb7f43d33321ae9a549 --- /dev/null +++ b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(gumbel_softmax_grad, + CPU, + ALL_LAYOUT, + phi::GumbelSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb406665c5f4f63a67ea84f5516b93fc82644e67 --- /dev/null +++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gumbel_softmax_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct GumbleNoiseGenerator { + static void Transform(const CPUContext& ctx, + const T* input_data, + T* output_data, + int size_to_axis, + int size_from_axis, + const float temperature) { + // generate uniform random number + const int size = size_to_axis * size_from_axis; + std::uniform_real_distribution dist(0.00001, 1); + auto engine = ctx.GetGenerator()->GetCPUEngine(); + DenseTensor random_tensor; + random_tensor.Resize(make_ddim({size})); + auto* random_data = ctx.template Alloc(&random_tensor); + for (int64_t i = 0; i < size; ++i) { + random_data[i] = dist(*engine); + } + + // generate gumbel noise + DDim dim_2d{size_to_axis, size_from_axis}; + auto gumbel_noise_eigen = EigenMatrix::From(random_tensor, dim_2d); + gumbel_noise_eigen = -(((-(gumbel_noise_eigen.log())).log())); + + // add noise + for (int64_t i = 0; i < size_to_axis * size_from_axis; i++) { + output_data[i] = (input_data[i] + random_data[i]) / temperature; + } + } +}; + +template +struct OneHotGenerator { + static void Transform(const CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + int axis) { + DenseTensor index; + std::vector index_dim; + const auto rank = x.dims().size(); + const int size_to_axis = funcs::SizeToAxis(axis, x.dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, x.dims()); + const int size_out_axis = funcs::SizeOutAxis(axis, x.dims()); + + for (int i = 0; i < x.dims().size(); i++) { + if (i != axis) index_dim.push_back(x.dims().Get()[i]); + } + DDim index_ddim(index_dim.data(), rank - 1); + index.Resize(index_ddim); + auto* index_data = ctx.template Alloc(&index); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMaxFunctor functor##rank; \ + functor##rank(ctx, *out, &index, axis); + switch (out->dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + 
CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_ENFORCE_LE( + out->dims().size(), + 6, + errors::InvalidArgument("gumbel_softmax operator doesn't support " + "tensors whose ranks are greater " + "than 6 in CPU mode.")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + + funcs::set_constant(ctx, out, 0.0); + for (int i = 0; i < size_to_axis; i++) { + for (int j = 0; j < size_out_axis; j++) { + *(out->data() + i * size_from_axis + j + + index_data[i * size_out_axis + j] * size_out_axis) = 1.0; + } + } + } +}; + +} // namespace phi + +PD_REGISTER_KERNEL( + gumbel_softmax, CPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index fbcf47c3070e68470a2eecf3b4c6eaa6c37926d2..82b88f868d8a70cd61073b65bb24fd195baeb5c2 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -77,7 +77,7 @@ void HistogramKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(histogram, +PD_REGISTER_KERNEL(histogram, CPU, ALL_LAYOUT, phi::HistogramKernel, diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..654f2c9400af00484e6921aae63aeb0d93b521ae --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss_grad, CPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..702c0589057af7079e6e0a41f1058063922790fe --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/phi/kernels/huber_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss, CPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/increment_kernel.cc b/paddle/phi/kernels/cpu/increment_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..70c178d25a10ab9e65cc4fbbc8f5f3a5176c17ca --- /dev/null +++ b/paddle/phi/kernels/cpu/increment_kernel.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/increment_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/increment_kernel_impl.h" + +PD_REGISTER_KERNEL(increment, + CPU, + ALL_LAYOUT, + phi::IncrementKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..74664fb270b2d27a56e7eb6634b50f167b2764ba --- /dev/null +++ b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/label_smooth_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void LabelSmoothGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* label_grad) { + ctx.template Alloc(label_grad); + auto d_out_dim = out_grad.dims()[out_grad.dims().size() - 1]; + if (d_out_dim != 0) { + auto d_out = EigenVector::Flatten(out_grad); + auto d_in = EigenVector::Flatten(*label_grad); + + auto& dev = *ctx.eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(label_smooth_grad, + CPU, + ALL_LAYOUT, + phi::LabelSmoothGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c76fb826cdfcce4a37c1d97de0ef37217249a727 --- /dev/null +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/label_smooth_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void LabelSmoothKernel(const Context& ctx, + const DenseTensor& label, + paddle::optional prior_dist, + float epsilon, + DenseTensor* out) { + auto label_dim = label.dims()[label.dims().size() - 1]; + ctx.template Alloc(out); + auto& dev = *ctx.eigen_device(); + if (label_dim != 0) { + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_in = EigenVector::Flatten(label); + if (prior_dist.is_initialized()) { + auto dist = EigenVector::Flatten(*prior_dist.get_ptr()); + eigen_out.device(dev) = + static_cast(1 - epsilon) * eigen_in + + static_cast(epsilon) * + dist.broadcast(Eigen::DSizes(label.numel() / label_dim)); + } else { + eigen_out.device(dev) = static_cast(1 - epsilon) * eigen_in + + static_cast(epsilon / label_dim); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + label_smooth, CPU, ALL_LAYOUT, phi::LabelSmoothKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc index 7cfb42dbcf96faef7a2b4a4d9f95b8d3a1cb28d6..d74919011ec5da08b700b974393fcc70de22b21c 100644 --- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( lerp_grad, CPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc index 
97083c96464c305c1ccdb0ff674ce5aac372a335..7adfc35bfa321e8c111a11998e3b0b683009e619 100644 --- a/paddle/phi/kernels/cpu/lerp_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_kernel.cc @@ -17,4 +17,4 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" -PT_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} +PD_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index 071bbba1975e40abe65cce3b50972cb282e45c95..7fe41e686af8c54d1d105ffe5ff43c5e9c7a92e8 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -43,7 +43,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select_grad, +PD_REGISTER_KERNEL(masked_select_grad, CPU, ALL_LAYOUT, phi::MaskedSelectGradKernel, diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index 08fc3f69f01e17c7e18b0f1307781d9d5290e801..f377658d507f6086101e1cdb0f0ab1891536e771 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -48,7 +48,7 @@ void MaskedSelectKernel(const Context& dev_ctx, DDim out_dim{out_size}; out->Resize(out_dim); - auto out_data = out->mutable_data(paddle::platform::CPUPlace()); + auto out_data = out->mutable_data(phi::CPUPlace()); int index = 0; for (int i = 0; i < mask_size; i++) { @@ -61,7 +61,7 @@ void MaskedSelectKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select, +PD_REGISTER_KERNEL(masked_select, CPU, ALL_LAYOUT, phi::MaskedSelectKernel, diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 862ee42296c9244a37a018023d5f3d215b8204e0..5cfcfe62c7816c84a4f2876942b4d9b30dfad167 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -118,7 +118,7 @@ using complex128 = ::phi::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::phi::dtype::bfloat16; -PT_REGISTER_KERNEL(add_raw, +PD_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, phi::AddRawKernel, @@ -129,7 +129,7 @@ PT_REGISTER_KERNEL(add_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract_raw, +PD_REGISTER_KERNEL(subtract_raw, CPU, ALL_LAYOUT, phi::SubtractRawKernel, @@ -139,8 +139,9 @@ PT_REGISTER_KERNEL(subtract_raw, int, int64_t, complex64, - complex128) {} -PT_REGISTER_KERNEL(divide_raw, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, phi::DivideRawKernel, @@ -150,7 +151,7 @@ PT_REGISTER_KERNEL(divide_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply_raw, +PD_REGISTER_KERNEL(multiply_raw, CPU, ALL_LAYOUT, phi::MultiplyRawKernel, @@ -160,8 +161,9 @@ PT_REGISTER_KERNEL(multiply_raw, int64_t, bool, complex64, - complex128) {} -PT_REGISTER_KERNEL(sum_raw, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, phi::SumRawKernel, @@ -176,5 +178,5 @@ PT_REGISTER_KERNEL(sum_raw, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc 
b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index 56a185e4ade064f91b1e7a52ff48997c7e9941e1..c68e8115e898b3701b9f568ac501260615b69ad4 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_KERNEL(matmul_grad, +PD_REGISTER_KERNEL(matmul_grad, CPU, ALL_LAYOUT, phi::MatmulGradKernel, @@ -28,7 +28,7 @@ PT_REGISTER_KERNEL(matmul_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_double_grad, +PD_REGISTER_KERNEL(matmul_double_grad, CPU, ALL_LAYOUT, phi::MatmulDoubleGradKernel, @@ -37,7 +37,7 @@ PT_REGISTER_KERNEL(matmul_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_triple_grad, +PD_REGISTER_KERNEL(matmul_triple_grad, CPU, ALL_LAYOUT, phi::MatmulTripleGradKernel, diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 8676aec3eccb475a9de346e34e15c01c195aebbb..2bf56c07a5bc7485fd29d6ac347a5311915d8f36 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_KERNEL(matmul, +PD_REGISTER_KERNEL(matmul, CPU, ALL_LAYOUT, phi::MatmulKernel, diff --git a/paddle/phi/kernels/cpu/multinomial_kernel.cc b/paddle/phi/kernels/cpu/multinomial_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9c2a569e0650dececa4541b3fdc7eba9b3f022e --- /dev/null +++ b/paddle/phi/kernels/cpu/multinomial_kernel.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/multinomial_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/multinomial_functor.h" + +namespace phi { + +template +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out) { + auto* in_data = x.data(); + int64_t* out_data = dev_ctx.template Alloc(out); + auto in_dims = x.dims(); + int64_t in_rank = in_dims.size(); + const int64_t num_categories = in_dims[in_rank - 1]; + const int64_t num_distributions = in_rank > 1 ? 
in_dims[in_rank - 2] : 1; + + funcs::MultinomialFunctor(dev_ctx, + out_data, + in_data, + num_samples, + replacement, + num_categories, + num_distributions); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + multinomial, CPU, ALL_LAYOUT, phi::MultinomialKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/cpu/mv_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3b7f94be41948267ed486a5109ffcc2d6db99fb --- /dev/null +++ b/paddle/phi/kernels/cpu/mv_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mv_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template <typename T, typename Context> +void MvGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& vec, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* vec_grad) { + auto dout = out_grad; + auto dx = x_grad; + auto dvec = vec_grad; + + auto dim_x = x.dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + // get data ptr + const T* x_data = x.data<T>(); + const T* vec_data = vec.data<T>(); + const T* dout_data = dout.data<T>(); + + if (dx) { + T* dx_data = dev_ctx.template Alloc<T>(dx); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + dx_data[i * n + j] = dout_data[i] * vec_data[j]; + } + } + } + + if (dvec) { + T* dvec_data = dev_ctx.template Alloc<T>(dvec); + + auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx); + + blas.GEMV(true, + dim_x[0], + dim_x[1], + static_cast<T>(1), + x_data, + dout_data, + static_cast<T>(0), + dvec_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mv_grad, CPU, ALL_LAYOUT, phi::MvGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/mv_kernel.cc b/paddle/phi/kernels/cpu/mv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f76ddda6dde5ba686fa7403910a245644a16f2d --- /dev/null +++ b/paddle/phi/kernels/cpu/mv_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
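// A minimal sketch (not part of this patch) of what MvGradKernel above computes for
// x of shape [m, n], vec of shape [n] and dout = out_grad of shape [m]:
//
//   dx[i][j] = dout[i] * vec[j];                 // outer product, the two nested loops
//   dvec[j]  = sum over i of x[i][j] * dout[i];  // x^T * dout, delegated to blas.GEMV(true, ...)
//
// so x_grad keeps the shape of x and vec_grad keeps the shape of vec.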
+ +#include "paddle/phi/kernels/mv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/mv_kernel_impl.h" + +PD_REGISTER_KERNEL(mv, CPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index d2073c07244bd54acbfcf7bf81028684f3ea739b..597207a05a226ac598d9141b42d5682bed5364f1 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -83,5 +83,5 @@ void NormGradKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( norm_grad, CPU, ALL_LAYOUT, phi::NormGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index e8f35b5fe7efd8dc04f16dffa877af082456a14d..50906d9c3bb9495817e81678b60fe3e426a22444 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -76,4 +76,4 @@ void NormKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} +PD_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b32065d4f0a145c382648cc1f192b032f7df0802 --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_shuffle_grad, + CPU, + ALL_LAYOUT, + phi::PixelShuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..80f8fa7b50efb7f2e685b7e202d89c0f9a382a18 --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pixel_shuffle_kernel.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pixel_shuffle, CPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/poisson_grad_kernel.cc b/paddle/phi/kernels/cpu/poisson_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e274a7af9ff30428b71dec8367deca71dbb4fe5 --- /dev/null +++ b/paddle/phi/kernels/cpu/poisson_grad_kernel.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/poisson_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + poisson_grad, CPU, ALL_LAYOUT, phi::PoissonGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/poisson_kernel.cc b/paddle/phi/kernels/cpu/poisson_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a3e32c2f07853f57e123e64660cd6bc50d8574b --- /dev/null +++ b/paddle/phi/kernels/cpu/poisson_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/poisson_kernel.h" + +namespace phi { + +template +void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + int64_t size = x.numel(); + + auto gen = ctx.GetGenerator(); + auto engine = gen->GetCPUEngine(); + + for (int64_t i = 0; i < size; ++i) { + std::poisson_distribution<> dist(x_data[i]); + out_data[i] = static_cast(dist(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + poisson, CPU, ALL_LAYOUT, phi::PoissonKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/randint_kernel.cc b/paddle/phi/kernels/cpu/randint_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..feb418949ba40d3bf553c2df0b4300cc686a0ef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/randint_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/randint_kernel.h" + +#include <random> + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template <typename T, typename Context> +void RandintRawKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out) { + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc<T>(out); + auto numel = out->numel(); + std::shared_ptr<std::mt19937_64> engine; + if (seed) { + engine = std::make_shared<std::mt19937_64>(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + std::uniform_int_distribution<T> dist(low, high - 1); + for (int64_t i = 0; i < numel; ++i) { + data[i] = dist(*engine); + } +} + +template <typename T, typename Context> +void RandintKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out) { + RandintRawKernel<T>(dev_ctx, low, high, shape, dtype, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + randint_raw, CPU, ALL_LAYOUT, phi::RandintRawKernel, int, int64_t) {} + +PD_REGISTER_KERNEL(randint, CPU, ALL_LAYOUT, phi::RandintKernel, int, int64_t) { +} diff --git a/paddle/phi/kernels/cpu/randperm_kernel.cc b/paddle/phi/kernels/cpu/randperm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6cb435f53b85bd22afba1a0d31b16ecd4c27204b --- /dev/null +++ b/paddle/phi/kernels/cpu/randperm_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
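// A minimal usage sketch (hypothetical call, not from this patch) of the seed
// handling in RandintRawKernel above: a non-zero seed builds a local
// std::mt19937_64 engine for deterministic output, while seed == 0 falls back to
// the device context's global generator:
//
//   phi::RandintRawKernel<int>(dev_ctx, /*low=*/0, /*high=*/10,
//                              phi::ScalarArray(std::vector<int64_t>{4, 4}),
//                              phi::DataType::INT32, /*seed=*/42, &out);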
+ +#include "paddle/phi/kernels/randperm_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + + for (int i = 0; i < n; ++i) { + out_data[i] = static_cast(i); + } + std::shuffle(out_data, out_data + n, *engine); +} + +template +void RandpermKernel(const Context& dev_ctx, + int n, + DataType dtype, + DenseTensor* out) { + RandpermRawKernel(dev_ctx, n, dtype, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(randperm_raw, + CPU, + ALL_LAYOUT, + phi::RandpermRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(randperm, + CPU, + ALL_LAYOUT, + phi::RandpermKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index 156afb8798de40000dcdea7d613734b92f1bc162..e929b5bd7219b60acb226374f67a0bc511c41723 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -51,7 +51,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 6be931904d133159b907d296d17aebdba9bc2501..5fe11ffbd6d5c08b5072b61ab23d6fbea1879b53 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -21,4 +21,4 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/bfloat16.h" -PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} +PD_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff34ef26f6bd3aea13815cb347719f054fd0a058 --- /dev/null +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/size_kernel.h" +#include "paddle/phi/kernels/impl/size_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(size, + CPU, + ALL_LAYOUT, + phi::SizeKernel, + int, + int64_t, + phi::dtype::float16, + float, + double, + bool) {} diff --git a/paddle/phi/kernels/cpu/softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef90f9c6762d680a00e8841a20ff4ddcd5abe28a --- /dev/null +++ b/paddle/phi/kernels/cpu/softmax_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + softmax_grad, CPU, ALL_LAYOUT, phi::SoftmaxGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..537b4326681a175fbad7593eed1d8b6caee9d86c --- /dev/null +++ b/paddle/phi/kernels/cpu/softmax_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" + +PD_REGISTER_KERNEL( + softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index d02909f007da462089903d0f0764e2cf86231ede..722681fb7bc3f9d9f75b92468b89931910dd532e 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/cpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { template @@ -54,13 +54,14 @@ void SplitKernel(const Context& dev_ctx, paddle::operators::StridedMemcpyWithAxis0( dev_ctx, x, shape_refer, &outs); } else { - SplitImpl(dev_ctx, x, shape_refer, axis, &outs); + phi::funcs::SplitFunctor functor; + functor(dev_ctx, x, shape_refer, axis, &outs); } } } // namespace phi -PT_REGISTER_KERNEL(split, +PD_REGISTER_KERNEL(split, CPU, ALL_LAYOUT, phi::SplitKernel, @@ -69,4 +70,5 @@ PT_REGISTER_KERNEL(split, int64_t, int, bool, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/trace_grad_kernel.cc b/paddle/phi/kernels/cpu/trace_grad_kernel.cc index e6ffd99bc53bd837aa3ef5ea142890fd4786249d..2167851b197d142a3e9c4b104175fd9147de6972 100644 --- a/paddle/phi/kernels/cpu/trace_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_grad_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h" -PT_REGISTER_KERNEL(trace_grad, +PD_REGISTER_KERNEL(trace_grad, CPU, ALL_LAYOUT, phi::TraceGradKernel, diff --git a/paddle/phi/kernels/cpu/trace_kernel.cc b/paddle/phi/kernels/cpu/trace_kernel.cc index 2b2cda6491d48487834321b376920f8943ea3650..3646e226519139430818c0f17b3f40c61c516dbd 100644 --- a/paddle/phi/kernels/cpu/trace_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_kernel.cc @@ -45,7 +45,7 @@ void TraceKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trace, +PD_REGISTER_KERNEL(trace, CPU, ALL_LAYOUT, phi::TraceKernel, diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc index 7fc677c16ef7397e0963bbd1c9eed3ac49f136e0..4d85dd609e2d1f14cc476a1c53ba0506e6b519a5 100644 --- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc @@ -30,7 +30,7 @@ void TruncGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trunc_grad, +PD_REGISTER_KERNEL(trunc_grad, CPU, ALL_LAYOUT, phi::TruncGradKernel, diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc index 10e42196679fa546f7611b97fbcda812bedf4b23..babae6ce7c9318f7cb4ba1f15aedbe38de5ebbd3 100644 --- a/paddle/phi/kernels/cpu/trunc_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_kernel.cc @@ -35,5 +35,5 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( trunc, CPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ebc032ef54538188d8e287673c0d31fae9ad197b --- /dev/null +++ 
b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h" + +#include <limits> +#include <memory> +#include <random> + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/generator.h" + +namespace phi { + +template <typename T, typename Context> +void TruncatedGaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + T* data = dev_ctx.template Alloc<T>(tensor); + + std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(), + 1.0); + TruncatedNormal<T> truncated_normal(mean, std); + int64_t size = tensor->numel(); + + auto engine = paddle::framework::GetCPURandomEngine(seed); + for (int64_t i = 0; i < size; ++i) { + data[i] = truncated_normal(dist(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(truncated_gaussian_random, + CPU, + ALL_LAYOUT, + phi::TruncatedGaussianRandomKernel, + float) {} diff --git a/paddle/phi/kernels/cpu/unbind_kernel.cc b/paddle/phi/kernels/cpu/unbind_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..39cc2f8fc4662a0893fb8b73b138a52b810f59b8 --- /dev/null +++ b/paddle/phi/kernels/cpu/unbind_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unbind_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unbind_kernel_impl.h" + +PD_REGISTER_KERNEL(unbind, + CPU, + ALL_LAYOUT, + phi::UnbindKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c97005dd84547eeb04603da6dc29b922715b936a --- /dev/null +++ b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unfold_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + unfold_grad, CPU, ALL_LAYOUT, phi::UnfoldGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/unfold_kernel.cc b/paddle/phi/kernels/cpu/unfold_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e38d8acd098204e82245ab697967b8c209bfb0e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/unfold_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unfold_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_kernel_impl.h" + +PD_REGISTER_KERNEL(unfold, CPU, ALL_LAYOUT, phi::UnfoldKernel, float, double) {} diff --git a/paddle/phi/kernels/cross_grad_kernel.h b/paddle/phi/kernels/cross_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9ea0804a94b6b5d145a13c8f794a9f01498bf7db --- /dev/null +++ b/paddle/phi/kernels/cross_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/cross_kernel.h b/paddle/phi/kernels/cross_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..567889e0783452bf17630a074528cfbf3658ec38 --- /dev/null +++ b/paddle/phi/kernels/cross_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/diag_kernel.h b/paddle/phi/kernels/diag_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8dc919fa633606ce30657cb20a59fbf615e3e15a --- /dev/null +++ b/paddle/phi/kernels/diag_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h index f233ba2a956276ad22819d49c30fbcbaf8a671c3..7cf7282307a4b91a771441d3218121b606afdf81 100644 --- a/paddle/phi/kernels/diagonal_kernel.h +++ b/paddle/phi/kernels/diagonal_kernel.h @@ -25,4 +25,4 @@ void DiagonalKernel(const Context& dev_ctx, int axis1, int axis2, DenseTensor* out); -} // pten +} // phi diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h index b95d98895aa8edda497a730281603028b98bc4f0..38912a5ccc442b6ea5fb484b708754dd706ae706 100644 --- a/paddle/phi/kernels/digamma_grad_kernel.h +++ b/paddle/phi/kernels/digamma_grad_kernel.h @@ -24,4 +24,4 @@ void DigammaGradKernel(const Context& ctx, const DenseTensor& x, DenseTensor* x_grad); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index 1772a33e4ee4cd88d80705971462b632c1015c3c..ce25f2e148e963054fcfa2a51321954b45a4297b 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -21,4 +21,4 @@ namespace phi { template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 6d9e733b2f57677c70e259f39d20c332a5fff195..6e5f15fe1692b473965f96f68fd86fad87f1892e 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -25,7 +25,8 @@ void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DataType 
dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); + dev_ctx.template Alloc<T>(out); } template <typename T, typename Context> @@ -38,7 +39,7 @@ void EmptyLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, CPU, ALL_LAYOUT, phi::EmptyKernel, @@ -54,7 +55,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex<float>, phi::dtype::complex<double>) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, CPU, ALL_LAYOUT, phi::EmptyLikeKernel, @@ -68,10 +69,12 @@ PT_REGISTER_KERNEL(empty_like, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex<float>, - phi::dtype::complex<double>) {} + phi::dtype::complex<double>) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, phi::EmptyKernel, @@ -86,7 +89,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex<float>, phi::dtype::complex<double>) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, GPU, ALL_LAYOUT, phi::EmptyLikeKernel, @@ -100,5 +103,7 @@ PT_REGISTER_KERNEL(empty_like, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex<float>, - phi::dtype::complex<double>) {} + phi::dtype::complex<double>) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #endif diff --git a/paddle/phi/kernels/empty_kernel.h b/paddle/phi/kernels/empty_kernel.h index 54ba8b16c1d7409915f11411e99abaac03586aec..0b8d95ee94fb5480684023ec6c71698ba06d9c13 100644 --- a/paddle/phi/kernels/empty_kernel.h +++ b/paddle/phi/kernels/empty_kernel.h @@ -54,22 +54,20 @@ DenseTensor Empty(const Context& dev_ctx) { } template <typename T, typename Context> -DenseTensor Empty(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype = DataType::FLOAT32) { +DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { auto dense_out = Empty<T, Context>(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type(); CreateInferMeta(shape, dtype, &meta_out); EmptyKernel<T, Context>(dev_ctx, shape, dtype, &dense_out); return dense_out; } template <typename T, typename Context> -DenseTensor EmptyLike(const Context& dev_ctx, - const DenseTensor& x, - DataType dtype = DataType::UNDEFINED) { +DenseTensor EmptyLike(const Context& dev_ctx, const DenseTensor& x) { auto dense_out = Empty<T, Context>(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type(); CreateLikeInferMeta(x, dtype, &meta_out); EmptyLikeKernel<T, Context>(dev_ctx, x, dtype, &dense_out); return dense_out; diff --git a/paddle/phi/kernels/erfinv_grad_kernel.h b/paddle/phi/kernels/erfinv_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..67e70ad38caf4f74864500757b1f733188dbbc86 --- /dev/null +++ b/paddle/phi/kernels/erfinv_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
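// A minimal sketch (hypothetical call sites, not from this patch) of the revised
// functional helpers above: the DataType argument is gone, and the element type is
// now derived from the template parameter via CppTypeToDataType<T>::Type():
//
//   phi::DenseTensor a = phi::Empty<float>(dev_ctx, phi::ScalarArray(std::vector<int64_t>{2, 3}));
//   phi::DenseTensor b = phi::EmptyLike<float>(dev_ctx, a);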
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ErfinvGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/erfinv_kernel.h b/paddle/phi/kernels/erfinv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8380a62971ba452ed86ad9d993690c8e42afdd53 --- /dev/null +++ b/paddle/phi/kernels/erfinv_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h index eb32ed24568599b2966f1f7772b8e9f6e710063b..fb5a0112ffcf7120314471db3c30b0e72a2b9c81 100644 --- a/paddle/phi/kernels/expand_kernel.h +++ b/paddle/phi/kernels/expand_kernel.h @@ -26,4 +26,4 @@ void ExpandKernel(const Context& ctx, const ScalarArray& shape, DenseTensor* out); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/eye_kernel.h b/paddle/phi/kernels/eye_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8b21b8ae40562c979b23e4292a7591d9c6f10cf7 --- /dev/null +++ b/paddle/phi/kernels/eye_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void EyeKernel(const Context& ctx, + int64_t num_rows, + int64_t num_columns, + int dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 33e6c2724982a7c916636d2f782898eedf875225..f6ba2725004fe799f46ceebc26208f8adfda5047 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -26,13 +26,13 @@ void FlattenGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - phi::Copy(dev_ctx, out_grad, false, x_grad); - x_grad->ResizeAndAllocate(x_dims); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + x_grad->Resize(x_dims); } } // namespace phi -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, CPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -44,7 +44,7 @@ PT_REGISTER_KERNEL(flatten_grad, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -59,7 +59,7 @@ PT_REGISTER_KERNEL(flatten_grad, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, XPU, ALL_LAYOUT, phi::FlattenGradKernel, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 1ac444aa1792f4645c44feb117a5eacc409b0017..78ac9eaa785cd20c6087586892a9503ca4e24040 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx, int stop_axis, DenseTensor* out) { auto out_dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); - out->ResizeAndAllocate(out_dims); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(out_dims); } // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate @@ -48,7 +48,7 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, CPU, ALL_LAYOUT, phi::FlattenKernel, @@ -60,7 +60,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, CPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -73,7 +73,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, GPU, ALL_LAYOUT, phi::FlattenKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, GPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -101,7 +101,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, XPU, ALL_LAYOUT, phi::FlattenKernel, @@ -112,7 +112,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, XPU, ALL_LAYOUT, phi::FlattenWithXShape, diff --git a/paddle/phi/kernels/flip_kernel.h b/paddle/phi/kernels/flip_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4470486fec0fb6ba1e176d9696bf43b559b62485 --- /dev/null +++ b/paddle/phi/kernels/flip_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index 394aab8f96e1ad1e8f2fb53ee4a163e7ec874226..c7b1f9af0e3191ec217d2907677ff34edebc551b 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -48,10 +48,10 @@ void FullLikeKernel(const Context& dev_ctx, template DenseTensor Full(const Context& dev_ctx, const ScalarArray& shape, - const Scalar& val, - DataType dtype = DataType::FLOAT32) { + const Scalar& val) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); FullKernel(dev_ctx, shape, val, dtype, &dense_out); return dense_out; @@ -60,10 +60,10 @@ DenseTensor Full(const Context& dev_ctx, template DenseTensor FullLike(const Context& dev_ctx, const DenseTensor& x, - const Scalar& val, - DataType dtype = DataType::UNDEFINED) { + const Scalar& val) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); FullLikeKernel(dev_ctx, x, val, dtype, &dense_out); return dense_out; diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index ba0c848df434ed403c29a5754043784066f7ef2a..aa4fac169200753639c48f5e9b5fa8c3bbfbd33c 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -3,3 +3,4 @@ add_subdirectory(blas) add_subdirectory(lapack) math_library(math_function DEPS blas dense_tensor tensor) +math_library(concat_and_split_functor DEPS dense_tensor) diff --git a/paddle/phi/kernels/funcs/axis_utils.h b/paddle/phi/kernels/funcs/axis_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..02a89471889a7abdda0e9856bf8c8d006895910d --- /dev/null +++ b/paddle/phi/kernels/funcs/axis_utils.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { + +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} + +static inline int SizeToAxis(const int axis, DDim dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeFromAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeOutAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 4d7700a89d27bb66e741b1e38207d5bd3a797658..2868aa5acb75e37110f02cf30e761625a3cc8ff7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -76,6 +76,36 @@ struct CBlas { "Blas VCOPY do not supported on CPU with bfloat16," " please check your code")); } + + template + static void VADD(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } + } + + template + static void VMUL(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } + } + + template + static void VSUB(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } + } }; #ifdef PADDLE_WITH_MKLML diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index be57b8630f89578e8de48f6dc581cb6fc37a1048..84a36b849afa1c4cdcc1a0f4d4ada598944a1faa 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) namespace kps = phi::kps; @@ -122,7 +122,7 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const phi::DDim &dims, int axis) { - const int N = max(static_cast(ins.size()), 2); + const int N = std::max(static_cast(ins.size()), 2); dim_size = dims.size(); out_dims = phi::vectorize(dims); in_dims.resize(N); @@ -183,7 +183,7 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template __device__ __forceinline__ void LoadData( @@ -268,7 +268,7 @@ __global__ void VectorizedBroadcastKernel( int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP for (; block_offset < main_offset; block_offset += stride) { VectorizedBroadcastKernelImpl outs_data; for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = ctx.Alloc((*outs)[i]); + outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); } for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = (_ptr_ InT *)(ins[i]->data()); + ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); if (use_broadcast[i]) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} @@ -363,7 +363,7 @@ void LaunchBroadcastKernel(const KPDevice &ctx, } } -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP const int threads = 64; const int blocks = 8; int main_offset = (numel / (VecSize * threads)) * VecSize * threads; diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index e14241d03c3af09bd1d0201da0f53ffadd2b2c4a..d5289dcc22cbc546acc4980403e7e4641abe39f1 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -42,12 +42,12 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -72,7 +72,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || y_dims_array[i] <= 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. Operands could " "not be broadcast together with the shape of X = [%s] and " "the shape of Y = [%s]. 
Received [%d] in X is not equal to " @@ -128,5 +128,17 @@ static void GetBroadcastDims(const DDim &in_dims, } } +inline bool CheckDims(const DDim &dims_x, const DDim &dims_y) { + if (dims_x.size() != dims_y.size()) { + return false; + } + for (int i = 0; i < dims_x.size(); i++) { + if (dims_x[i] != dims_y[i]) { + return false; + } + } + return true; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 450adfcc68b7e84e27a2f6bf2c6c22551bab8892..86dbdd099ecde72e932cc6cfa492486b65c7ebc2 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -154,6 +154,53 @@ struct AbsFunctor>> { int64_t numel_; }; +template +struct AbsGradCUDAFunctor { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + + HOSTDEVICE inline T operator()(const T x, const T dout) const { + T output; + if (x == T(0)) { + output = T(0); + } else { + output = T(dout) * (x / T(std::abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const float dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const double dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + template struct AbsGradFunctor { AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa73ba5f689906e73f3f0e3a845aa397ad0a33c1 --- /dev/null +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" + +namespace phi { +namespace funcs { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +struct ConcatFunctor { + void operator()(const phi::CPUContext& context, + const std::vector& input, + int axis, + phi::DenseTensor* output) { + // TODO(zcd): Add input data validity checking + size_t num = input.size(); + + int64_t rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int64_t out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (size_t i = 0; i < num; ++i) { + int64_t t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + auto cpu_place = context.GetPlace(); + + // computation + auto output_data = output->data(); + int64_t col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int64_t col_len = input_cols[j]; + auto input_data = input[j].data(); + for (int64_t k = 0; k < out_rows; ++k) { + paddle::memory::Copy(cpu_place, + output_data + k * out_cols + col_idx, + cpu_place, + input_data + k * col_len, + sizeof(T) * col_len); + } + col_idx += col_len; + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. + */ +template +struct SplitFunctor { + public: + void operator()(const phi::CPUContext& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + // TODO(zcd): Add input data validity checking + size_t num = outputs->size(); + + int input_rows = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + input_rows *= dim_0[i]; + } + + int input_cols = 0; + + std::vector output_cols(outputs->size()); + for (size_t i = 0; i < num; ++i) { + int t_cols = ref_inputs[i]->numel() / input_rows; + input_cols += t_cols; + output_cols[i] = t_cols; + } + auto cpu_place = context.GetPlace(); + + // computation + for (int k = 0; k < input_rows; ++k) { + const T* src_ptr = input.data() + k * input_cols; + int col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int col_len = output_cols[j]; + auto* out_tensor = outputs->at(j); + if (out_tensor != nullptr) { + T* dst_ptr = out_tensor->data() + k * col_len; + paddle::memory::Copy(cpu_place, + dst_ptr, + cpu_place, + src_ptr + col_idx, + sizeof(T) * col_len); + } + col_idx += col_len; + } + } + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..840c8872f50f83c2859f07be2e0e7242a74004a7 --- /dev/null +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -0,0 +1,572 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" + +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +namespace phi { +namespace funcs { + +template +__global__ void ConcatKernel_(const T** inputs, + const int64_t* input_cols, + int col_size, + const int64_t output_rows, + const int64_t output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = input_cols[0]; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = input_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = input_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + + const T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__device__ void ConcatKernelDetail(const T** inputs_data, + const int fixed_in_col, + const int out_rows, + const int out_cols, + T* output_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * 1.0 / fixed_in_col; + int in_offset = tid_x - split * fixed_in_col; + const T* input_ptr = inputs_data[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { + output_data[tid_y * out_cols + tid_x] = + input_ptr[tid_y * fixed_in_col + in_offset]; + } + } +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[2]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const T* input_addr2, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[3]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + inputs_data[2] = input_addr2; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const T* input_addr2, + const T* input_addr3, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[4]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + inputs_data[2] = input_addr2; + inputs_data[3] = input_addr3; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T** inputs_data, + const int in_num, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t* out_cols, + int out_cols_size, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; 
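// In this general SplitKernel_ path, out_cols holds a running prefix sum of
// the per-output column widths: curr_segment tracks which output tensor the
// current column tid_x falls into, curr_offset is that segment's first global
// column, and (tid_x - curr_offset) is the local column inside that output.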
+ int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__device__ void SplitKernelDetail(const T* input_data, + const int in_row, + const int in_col, + const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T** outputs_data) { + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1) { + T* outputs_data[2]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1, + T* outputs_addr2) { + T* outputs_data[3]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + outputs_data[2] = outputs_addr2; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1, + T* outputs_addr2, + T* outputs_addr3) { + T* outputs_data[4]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + outputs_data[2] = outputs_addr2; + outputs_data[3] = outputs_addr3; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +static inline void GetBlockDims(const phi::GPUContext& context, + int64_t num_rows, + int64_t num_cols, + dim3* block_dims, + dim3* grid_dims) { + // Set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (num_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
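// The expression below rounds num_cols up to a multiple of 32 (one warp):
// e.g. num_cols = 70 gives ((70 + 31) >> 5) << 5 = 96.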
+ block_cols = ((num_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + *block_dims = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((num_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = std::min(max_blocks / grid_cols, + std::max(num_rows / block_rows, (int64_t)1)); + *grid_dims = dim3(grid_cols, grid_rows, 1); +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. + */ + +template +struct ConcatFunctor { + void operator()(const phi::GPUContext& context, + const std::vector& input, + int axis, + phi::DenseTensor* output) { + // TODO(zcd): Add input data validity checking + int in_num = input.size(); + int64_t in_row = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + in_row *= dim_0[i]; + } + int64_t in_col = input[0].numel() / in_row; + int64_t out_row = in_row, out_col = 0; + + int inputs_col_num = in_num + 1; + std::vector inputs_data_vec(in_num); + std::vector inputs_col_vec(inputs_col_num); + const T** inputs_data = inputs_data_vec.data(); + int64_t* inputs_col = inputs_col_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, col_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + in_num * sizeof(T*)); + inputs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + inputs_col_num * sizeof(int)); + inputs_col = reinterpret_cast(col_alloc->ptr()); +#endif + + inputs_col[0] = 0; + bool has_same_shape = true; + for (int i = 0; i < in_num; ++i) { + int64_t t_cols = input[i].numel() / in_row; + if (has_same_shape) { + if (t_cols != in_col) has_same_shape = false; + } + out_col += t_cols; + inputs_col[i + 1] = out_col; + inputs_data[i] = input[i].data(); + } + + dim3 block_dims; + dim3 grid_dims; + GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims); + + paddle::memory::allocation::AllocationPtr tmp_dev_ins_data; + const T** dev_ins_data = nullptr; + if (!has_same_shape || in_num < 2 || in_num > 4) { + tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_data, in_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_data->ptr(), + paddle::platform::CPUPlace(), + restored, + in_num * sizeof(T*), + context.stream()); + dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); + } + + if (has_same_shape) { + if (in_num == 2) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + in_col, + out_row, + out_col, + output->data()); + } else if (in_num == 3) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + 
inputs_data[2], + in_col, + out_row, + out_col, + output->data()); + } else if (in_num == 4) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + inputs_data[2], + inputs_data[3], + in_col, + out_row, + out_col, + output->data()); + } else { + ConcatKernel_<<>>( + dev_ins_data, in_num, in_col, out_row, out_col, output->data()); + } + } else { + auto tmp_dev_ins_col_data = + paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_col_data->ptr(), + paddle::platform::CPUPlace(), + restored, + inputs_col_num * sizeof(int64_t), + context.stream()); + int64_t* dev_ins_col_data = + static_cast(tmp_dev_ins_col_data->ptr()); + + ConcatKernel_<<>>( + dev_ins_data, + dev_ins_col_data, + static_cast(inputs_col_num), + out_row, + out_col, + output->data()); + } + +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* col_alloc_released = col_alloc.release(); + context.AddStreamCallback([data_alloc_released, col_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + paddle::memory::allocation::Allocator::AllocationDeleter( + col_alloc_released); + }); +#endif + } +}; + +template +class SplitFunctor { + public: + void operator()(const phi::GPUContext& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + // TODO(zcd): Add input data validity checking + int o_num = outputs->size(); + int64_t out_row = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + out_row *= dim_0[i]; + } + + int64_t out0_col = ref_inputs[0]->numel() / out_row; + int64_t in_col = 0, in_row = out_row; + bool has_same_shape = true; + + int outputs_cols_num = o_num + 1; + std::vector outputs_data_vec(o_num); + std::vector outputs_cols_vec(outputs_cols_num); + T** outputs_data = outputs_data_vec.data(); + int64_t* outputs_cols = outputs_cols_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. 
Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, cols_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + o_num * sizeof(T*)); + outputs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + (outputs_cols_num) * sizeof(int64_t)); + outputs_cols = reinterpret_cast(cols_alloc->ptr()); +#endif + + outputs_cols[0] = 0; + for (int i = 0; i < o_num; ++i) { + int64_t t_col = ref_inputs.at(i)->numel() / out_row; + if (has_same_shape) { + if (t_col != out0_col) has_same_shape = false; + } + in_col += t_col; + outputs_cols[i + 1] = in_col; + if (outputs->at(i) != nullptr) { + outputs_data[i] = outputs->at(i)->data(); + } else { + outputs_data[i] = nullptr; + } + } + + dim3 block_dims; + dim3 grid_dims; + GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); + + paddle::memory::allocation::AllocationPtr tmp_dev_outs_data; + T** dev_out_gpu_data = nullptr; + if (!has_same_shape || o_num < 2 || o_num > 4) { + // TODO(chentianyu03): try to find a method to remove the Alloc function + tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_data, o_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_outs_data->ptr(), + paddle::platform::CPUPlace(), + restored, + o_num * sizeof(T*), + context.stream()); + dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + } + + if (has_same_shape) { + if (o_num == 2) { + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1]); + } else if (o_num == 3) { + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1], + outputs_data[2]); + } else if (o_num == 4) { + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1], + outputs_data[2], + outputs_data[3]); + } else { + SplitKernel_<<>>( + input.data(), in_row, in_col, out0_col, dev_out_gpu_data); + } + } else { + auto tmp_dev_ins_col_data = + // TODO(chentianyu03): try to find a method to remove the Alloc + // function + paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_col_data->ptr(), + paddle::platform::CPUPlace(), + restored, + outputs_cols_num * sizeof(int64_t), + context.stream()); + int64_t* dev_outs_col_data = + reinterpret_cast(tmp_dev_ins_col_data->ptr()); + + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + dev_outs_col_data, + static_cast(outputs_cols_num), + dev_out_gpu_data); + } +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* cols_alloc_released = cols_alloc.release(); + context.AddStreamCallback([data_alloc_released, cols_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + 
paddle::memory::allocation::Allocator::AllocationDeleter( + cols_alloc_released); + }); +#endif + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.h b/paddle/phi/kernels/funcs/concat_and_split_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..4cb15fe539b66b8a6fddccf18d92b95976db2a65 --- /dev/null +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/utils/data_type.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" + +namespace phi { +namespace funcs { + +/* + * \brief Concatenate the input tensors along the dimension axis. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input[0] = [[1,2],[3,4]] + * Input[1] = [[5,6]] + * axis = 0 + * + * Output = [[1,2], + * [3,4], + * [5,6]] + */ +template +struct ConcatFunctor { + void operator()(const Context& context, + const std::vector& input, + int axis, + phi::DenseTensor* output); +}; + +/* + * \brief Split the input tensors along the dimension axis into outputs. + * TODO(zcd): maybe it needs to be more detailed. 
+ * Examples: + * Input = [[1,2], + * [3,4], + * [5,6]] + * axis = 0 + * + * Output[0] = [[1,2],[3,4]] + * Output[1] = [[5,6]] + */ +template +class SplitFunctor { + public: + void operator()(const Context& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs); +}; + +} // namespace funcs +} // namespace phi + +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(phi::dtype::float16); \ + macro(phi::dtype::bfloat16); \ + macro(phi::dtype::complex); \ + macro(phi::dtype::complex); diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index 63f0c8058acc16f1665bda7d6a2b91cdc24ef2b0..70e3545b981fa7841f56a5a9ec2a9d4890b17d79 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -23,7 +23,7 @@ static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -42,17 +42,17 @@ static inline phi::DDim ComputeAndCheckShape( auto out_dims = inputs_dims[0]; size_t in_zero_dims_size = out_dims.size(); for (size_t i = 1; i < n; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), - out_dims.size(), - paddle::platform::errors::InvalidArgument( - "The shape of input[0] and input[%d] " - "is expected to be equal." - "But received input[0]'s shape = " - "[%s], input[%d]'s shape = [%s].", - i, - inputs_dims[0], - i, - inputs_dims[i])); + PADDLE_ENFORCE_EQ( + inputs_dims[i].size(), + out_dims.size(), + phi::errors::InvalidArgument("The shape of input[0] and input[%d] " + "is expected to be equal." + "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + i, + inputs_dims[0], + i, + inputs_dims[i])); for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { if (is_runtime) { @@ -71,7 +71,7 @@ static inline phi::DDim ComputeAndCheckShape( // check all shape in run time PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The %d-th dimension of input[0] and input[%d] " "is expected to be equal." "But received input[0]'s shape = " @@ -92,4 +92,4 @@ static inline phi::DDim ComputeAndCheckShape( } } // namespace funcs -} // namespace pten +} // namespace phi diff --git a/paddle/phi/kernels/funcs/diag_functor.h b/paddle/phi/kernels/funcs/diag_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..a806d1583a0b363d44aa9f0cf3b3a64f4a8ea6ff --- /dev/null +++ b/paddle/phi/kernels/funcs/diag_functor.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +namespace phi { +namespace funcs { + +inline int ComputeStride(int axis, phi::DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a82c4f66d010273f0f09fa71a38c3081fd1bc2ee..19a93970d090af060b888f512782975b073fff72 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -22,8 +22,8 @@ #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { namespace funcs { @@ -118,7 +118,7 @@ DenseTensor Diagonal(const DeviceContext& context, #endif // auto& dev_ctx = context.template device_context(); - paddle::platform::ForRange for_range(context, diag.numel()); + phi::funcs::ForRange for_range(context, diag.numel()); DiagonalFunctor functor( input_data, diag_arr, ret_arr, pos, dim_size, diag_data); for_range(functor); diff --git a/paddle/phi/kernels/funcs/eigen/common.h b/paddle/phi/kernels/funcs/eigen/common.h index dc64d3b122f1014ddfed081269859d46c26f43ad..d34427df0e499b78fccdfe80660277152560e34d 100644 --- a/paddle/phi/kernels/funcs/eigen/common.h +++ b/paddle/phi/kernels/funcs/eigen/common.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +// EigenDim converts phi::DDim into Eigen::DSizes. template struct EigenDim { using Type = Eigen::DSizes; @@ -29,7 +29,7 @@ struct EigenDim { static Type From(const DDim& dims) { PADDLE_ENFORCE_EQ(arity(dims), D, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension size should be equal to %d, but " "received dimension size is %d.", arity(dims), @@ -42,7 +42,7 @@ struct EigenDim { } }; -// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +// Interpret phi::Tensor as EigenTensor and EigenConstTensor. 
template { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " "between 0 and %d, but received number is %d.", rank, @@ -100,7 +100,7 @@ struct EigenMatrix : public EigenTensor { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " "between 0 and %d, but received number is %d.", rank, diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cu b/paddle/phi/kernels/funcs/eigen/elementwise.cu index 96d2ddba03c28df95331832ba2d4aa8e352d2f2b..3855ba8ccf94562f5c1b8ea2ca0e471fdb3f943d 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cu +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cu @@ -55,5 +55,5 @@ struct EigenSub { template struct EigenSub; -} // namespace fucns +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h index 5fc8f76d988d1449cc41a89a1740ffeb9a3b05df..fbb9d8e3d2ef552750fc98d10a63d230661adf49 100644 --- a/paddle/phi/kernels/funcs/eigen/extensions.h +++ b/paddle/phi/kernels/funcs/eigen/extensions.h @@ -14,6 +14,8 @@ #pragma once +#ifndef __xpu__ + #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" @@ -435,3 +437,5 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) { } // namespace numext } // namespace Eigen + +#endif // __xpu__ diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 9fb2dac6c425f6224da713fb6ada636355b42c26..d369781f845eb0887817f83be761b1027fc0bab0 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -14,19 +14,20 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" +#define HOSTDEVICE __host__ __device__ namespace kps = phi::kps; #endif @@ -343,7 +344,7 @@ inline void get_mid_dims(const DDim &x_dims, if (x_dims[i + axis] != y_dims[i]) { PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. Operands " "could not be broadcast together with the shape of " "X = [%s] and the shape of Y = [%s]. 
Received [%d] " @@ -417,7 +418,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, DX_OP dx_op, DY_OP dy_op) { size_t N = static_cast(phi::product(x_dim)); - paddle::platform::ForRange for_range(dev_ctx, N); + phi::funcs::ForRange for_range(dev_ctx, N); for_range(ElemwiseGradNoBroadcast{ x.data(), y.data(), @@ -436,7 +437,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) // static unroller template