diff --git a/.gitignore b/.gitignore index 62fed4090ef71208825df7eaae16c8e88df30355..cecd6fa91c754d0862d26a10833a83aa3ced819c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ build/ build_doc/ *.user *.tmp +*.pyc .vscode .idea diff --git a/CMakeLists.txt b/CMakeLists.txt index a4c1b9c8098e9e632a4a05c491e07b1ce051c945..5b499fb43ab996b1c1780c0276faad2c37a8808a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,6 +330,7 @@ if(WITH_BRPC_RDMA) endif() endif() + if(WITH_GPU) include(cuda) # lite subgraph compilation depends on CUDNN_ROOT, diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 27210e5260048a57cc442fce4c6cf8657e401568..a7a9e85ffd7314ac7026fccdf45fae2fa3de09d3 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,7 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) endfunction() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 847073fb7b57c255e82bd3f229e420a68b0af079..f7c17bd7cfe7e099e0afeaf623724e12387aff44 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -116,19 +116,19 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) -set_property(GLOBAL PROPERTY PTEN_MODULES "") -# find all pten modules is used for paddle static library +set_property(GLOBAL PROPERTY PHI_MODULES "") +# find all phi modules is used for paddle static library # for building inference libs -function(find_pten_modules TARGET_NAME) +function(find_phi_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) string(FIND "${__target_path}" "phi" pos) if(pos GREATER 1) - get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) - set(pten_modules ${pten_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY PTEN_MODULES "${pten_modules}") + get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) + set(phi_modules ${phi_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}") endif() -endfunction(find_pten_modules) +endfunction(find_phi_modules) function(common_link TARGET_NAME) if (WITH_PROFILER) @@ -324,7 +324,7 @@ function(cc_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if(cc_library_DEPS) # Don't need link libwarpctc.so @@ -497,7 +497,7 @@ function(nv_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) @@ -588,7 +588,7 @@ function(hip_library TARGET_NAME) else() hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (hip_library_DEPS) add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index b8d1f4eb116a96a5d3df92d56fade77ecd529b45..c48d31f7e4f90296ecc48acb56e619aae129106e 100644 --- 
a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -224,7 +224,7 @@ copy(inference_lib_dist DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) -# copy api headers for pten & custom op +# copy api headers for phi & custom op copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/) @@ -244,11 +244,11 @@ copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) -# the header file of pten is copied to the experimental directory, -# the include path of pten needs to be changed to adapt to inference api path +# the header file of phi is copied to the experimental directory, +# the include path of phi needs to be changed to adapt to inference api path add_custom_command(TARGET inference_lib_dist POST_BUILD - COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake" - COMMENT "Change pten header include path to adapt to inference api path") + COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake" + COMMENT "Change phi header include path to adapt to inference api path") # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 8843dd2628767e8cac167db0ff115d0b63aca53a..7affd59de162d5956672e5abfbf9f4b287fb7a83 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -73,6 +73,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + # rename in KP: .kps -> .cu + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() if (WITH_NV_JETSON) list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") endif() @@ -96,6 +102,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND hip_srcs ${TARGET}.cu) endif() + # rename in KP: .kps -> .cu + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) diff --git a/cmake/pten.cmake b/cmake/phi.cmake similarity index 84% rename from cmake/pten.cmake rename to cmake/phi.cmake index 5645ac6cfa3039afdad0514abade5c9ea9b35408..d9132b84455e7309713b99f9e574bfceb83c7b6c 100644 --- a/cmake/pten.cmake +++ b/cmake/phi.cmake @@ -51,7 +51,7 @@ function(generate_unify_header DIR_NAME) endforeach() # append header into extension.h string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}") - file(APPEND ${pten_extension_header_file} "#include \"${header_file}\"\n") + file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n") endfunction() # call kernel_declare need to make sure whether the target of input exists @@ -81,6 +81,8 @@ function(kernel_declare 
TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./xpu\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./gpudnn\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -94,6 +96,7 @@ function(kernel_library TARGET) set(cpu_srcs) set(gpu_srcs) set(xpu_srcs) + set(gpudnn_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -101,6 +104,8 @@ function(kernel_library TARGET) set(oneValueArgs SUB_DIR) set(multiValueArgs SRCS DEPS) + set(target_build_flag 1) + cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -123,6 +128,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + endif() endif() if (WITH_XPU) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) @@ -141,6 +149,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${cpu_srcs}) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) + list(APPEND all_srcs ${gpudnn_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -166,21 +175,22 @@ function(kernel_library TARGET) list(LENGTH cpu_srcs cpu_srcs_len) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) + list(LENGTH gpudnn_srcs gpudnn_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR - ${selected_rows_srcs_len} GREATER 0)) + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. 
if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() else() @@ -190,14 +200,14 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. - elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) @@ -234,35 +244,40 @@ function(kernel_library TARGET) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() else() - message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") + set(target_build_flag 0) endif() - if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR - ${selected_rows_srcs_len} GREATER 0) - # append target into PTEN_KERNELS property - get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) - set(pten_kernels ${pten_kernels} ${TARGET}) - set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels}) - endif() + if (${target_build_flag} EQUAL 1) + if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) + # append target into PHI_KERNELS property + get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) + set(phi_kernels ${phi_kernels} ${TARGET}) + set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) + endif() - # parse kernel name and auto generate kernel declaration - # here, we don't need to check WITH_XXX, because if not WITH_XXX, the - # xxx_srcs_len will be equal to 0 - if (${common_srcs_len} GREATER 0) - kernel_declare(${common_srcs}) - endif() - if (${cpu_srcs_len} 
GREATER 0) - kernel_declare(${cpu_srcs}) - endif() - if (${gpu_srcs_len} GREATER 0) - kernel_declare(${gpu_srcs}) - endif() - if (${xpu_srcs_len} GREATER 0) - kernel_declare(${xpu_srcs}) - endif() - if (${selected_rows_srcs_len} GREATER 0) - kernel_declare(${selected_rows_srcs}) + # parse kernel name and auto generate kernel declaration + # here, we don't need to check WITH_XXX, because if not WITH_XXX, the + # xxx_srcs_len will be equal to 0 + if (${common_srcs_len} GREATER 0) + kernel_declare(${common_srcs}) + endif() + if (${cpu_srcs_len} GREATER 0) + kernel_declare(${cpu_srcs}) + endif() + if (${gpu_srcs_len} GREATER 0) + kernel_declare(${gpu_srcs}) + endif() + if (${xpu_srcs_len} GREATER 0) + kernel_declare(${xpu_srcs}) + endif() + if (${gpudnn_srcs_len} GREATER 0) + kernel_declare(${gpudnn_srcs}) + endif() + if (${selected_rows_srcs_len} GREATER 0) + kernel_declare(${selected_rows_srcs}) + endif() endif() endfunction() diff --git a/cmake/pten_header.cmake b/cmake/phi_header.cmake similarity index 68% rename from cmake/pten_header.cmake rename to cmake/phi_header.cmake index 6341aca9ec739449448726913aac7dcb349d5ea0..c9b7e465337dd1cf9ca35f5a595221600ab33ca7 100644 --- a/cmake/pten_header.cmake +++ b/cmake/phi_header.cmake @@ -14,8 +14,8 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir") -function(pten_header_path_compat TARGET_PATH) -message(STATUS "pten header path compat processing: ${TARGET_PATH}") +function(phi_header_path_compat TARGET_PATH) +message(STATUS "phi header path compat processing: ${TARGET_PATH}") string(FIND ${TARGET_PATH} "experimental" pos) if (pos GREATER 1) file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") @@ -25,17 +25,17 @@ if (pos GREATER 1) string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" HEADER_CONTENT "${HEADER_CONTENT}") string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" HEADER_CONTENT "${HEADER_CONTENT}") file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "pten header path compat processing complete: ${header}") + message(STATUS "phi header path compat processing complete: ${header}") endif() endforeach() endif() endfunction() -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) # In order to be compatible with the original behavior, the header file name needs to be changed file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 5daaf29ae2895234374c736b39b2bacf50051562..41652f8b6ed6f717ad8a571be8e7a16408b34504 100644 --- 
a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,5 +1,5 @@ -cc_library(processgroup SRCS ProcessGroup.cc DEPS pten pten_api eager_api) +cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) if(WITH_NCCL) - cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context pten pten_api eager_api) + cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 9f2a8eb24533d12ca289543ee7f75d2c05f9b2a3..2009ec772e1cf66d3997e3f4be8f2e67bf2c32e3 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -238,7 +238,7 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, void* tensor_data = tensor->mutable_data( place, - framework::TransToPtenDataType(VarMessageToVarType(msg.data_type()))); + framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); // IO Buffer if (platform::is_cpu_place(place)) { @@ -281,7 +281,7 @@ void DeserializeSelectedRows( tensor->Resize(phi::make_ddim(vec_dim)); void* tensor_data = tensor->mutable_data( place, - framework::TransToPtenDataType(VarMessageToVarType(msg.data_type()))); + framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); // IO Buffer if (platform::is_cpu_place(place)) { unsigned long data_len; // NOLINT diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 711c46e995286e7369a42738a14eae86605a3e79..5e16ab2b391d0223a8b6fd9bae78cced9d4e2f11 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api) -cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) +cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt index 632e289ba230871fd5630d674767b32d9f7b8b3f..43ca707f4f6fbe76c234318d19791d512eb3152b 100644 --- a/paddle/fluid/eager/accumulation/CMakeLists.txt +++ 
b/paddle/fluid/eager/accumulation/CMakeLists.txt @@ -1 +1 @@ -cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info) +cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator phi phi_api grad_node_info) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 2e377e43ca3ec96c183a7b51830b71210d5d0290..3a2ec403c0a59aaa23decc72fb9581b5a7f78343 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -76,13 +76,13 @@ operator()( } void GradNodeAccumulation::RegisterReduceHook( - const std::function& hook) { - reduce_hooks_.emplace_back(hook); + std::shared_ptr&& hook) { + reduce_hooks_.emplace_back(std::move(hook)); } void GradNodeAccumulation::ApplyReduceHooks() { for (auto& hook : reduce_hooks_) { - hook(); + (*hook)(); } } } // namespace egr diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 787149ab305263fdbef2866e901e8af5116bc268..734cabdc3dc914349e2ad30b657bfb6542a7472a 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -16,6 +16,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" namespace egr { @@ -39,7 +40,7 @@ class GradNodeAccumulation : public GradNodeBase { /** * Register ReduceHook * **/ - void RegisterReduceHook(const std::function& hook); + void RegisterReduceHook(std::shared_ptr&& hook); /** * Apply ReduceHook here @@ -54,7 +55,7 @@ class GradNodeAccumulation : public GradNodeBase { const paddle::experimental::Tensor&)> retain_grad_hook_; - std::vector> reduce_hooks_; + std::vector> reduce_hooks_; }; } // namespace egr diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index e3fafb265ad9887a5683542d79ae07f30edee910..77d8ec57efcaa6c4e83a69f4b2a97b128b174389 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info) +cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) if(NOT ON_INFER) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 3dbfba0d9150f64afd1002fcf7f3e9365bf786d1..5a2595b9103e4d49845fa8938ee3577b6b3f3f06 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -33,36 +33,36 @@ static void ScaleDeviceDispatch(const phi::DenseTensor& dense_tensor, phi::DenseTensor* dense_out) { switch (dense_tensor.dtype()) { case phi::DataType::FLOAT64: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::FLOAT32: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias 
*/, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::INT64: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::INT32: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 8ede139ddc0446ddab4404ae2f749a3c84748d73..60b35340eabd1fa03f59cc0b7ea278351be96df1 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node) +cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) if(NOT ON_INFER) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt index 3a4f0ba320358ed1dbd0a493f7263aeae5633f87..c34df3972c23e14b8f15517d86091ccc2ae6d0fc 100644 --- a/paddle/fluid/eager/api/utils/CMakeLists.txt +++ b/paddle/fluid/eager/api/utils/CMakeLists.txt @@ -1,3 +1,3 @@ -cc_library(tensor_utils SRCS tensor_utils.cc DEPS pten pten_api autograd_meta grad_node_info accumulation_node) -cc_library(hook_utils SRCS hook_utils.cc DEPS pten tensor_utils autograd_meta grad_node_info utils accumulation_node) +cc_library(tensor_utils SRCS tensor_utils.cc DEPS phi phi_api autograd_meta grad_node_info accumulation_node) +cc_library(hook_utils SRCS hook_utils.cc DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node) cc_library(global_utils SRCS global_utils.cc DEPS place tracer) diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 748afe6d1f313daacbbe276b2a00a9687402e617..c7927716300528fdfa571de720ce12e7246b5f1d 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -22,19 +22,19 @@ namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook) { + std::shared_ptr&& hook) { // Find grad_node and out_rank from AutogradMeta std::shared_ptr grad_node = EagerUtils::grad_node(tensor); auto rank_info = EagerUtils::unsafe_autograd_meta(tensor)->OutRankInfo(); - grad_node->RegisterGradientHook(rank_info.first, rank_info.second, hook); + return grad_node->RegisterGradientHook(rank_info.first, rank_info.second, + std::move(hook)); } void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook) { + std::shared_ptr&& hook) { if (IsLeafTensor(tensor)) { VLOG(6) << "Register ReduceHook for leaf tensor"; std::shared_ptr grad_node = EagerUtils::grad_node(tensor); @@ -45,7 +45,7 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, "with type: GradNodeAccumulation")); auto accumulation_grad_node = std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RegisterReduceHook(hook); + 
accumulation_grad_node->RegisterReduceHook(std::move(hook)); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Only can register reduce hook for leaf Tensor.")); @@ -65,28 +65,27 @@ static void RetainGradForRegularNode( meta->WeakGrad(); // Define Hook - std::function - hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor.get(); - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + }; // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index 4c4ecc9fb801de67778cb7209a721dee3572bdf0..b36ef81125a8ca5ef1f2720b73021ae82395a9aa 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -16,17 +16,17 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook); + std::shared_ptr&& hook); void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook); + std::shared_ptr&& hook); void RetainGradForTensor(const paddle::experimental::Tensor& tensor); } // namespace egr_utils_api diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 628c0c500b3c4ade711f3b7ba6a9fa4b6b69a7c6..77c39d1b0a37c3946e4c170484118a5fb6f79170 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -22,7 +22,7 @@ #include "paddle/phi/api/all.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" namespace egr { @@ -43,7 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue( bool is_leaf) { paddle::experimental::Tensor out = paddle::experimental::full( phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, - phi::TransToPtenBackend(place)); + phi::TransToPhiBackend(place)); auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { diff --git 
a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index e1f4d6ee9a129e41b7e01fec7f414d8c8fbc880f..a8e0ed7a41a043e12332ad347f673a6c27e5f1ec 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" #define NUM_CREATED_DUP_INPUTS 4 @@ -544,7 +544,7 @@ static bool CheckOpProto(proto::OpProto* op_proto) { // since only OperatorWithKernel can run in dygraph mode. auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { return false; } @@ -2040,12 +2040,13 @@ static std::string GenerateGradNodeCCContents( const char* BWD_RETURN_TEMPLATE = " std::vector> hooked_grads = " - "egr::GradNodeBase::ApplyGradientHooks(grads);\n" + "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" " return outputs;\n"; - generated_grad_function_body = paddle::string::Sprintf( - BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); + generated_grad_function_body = + paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), + generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 5a536067dbe4955efef136f5e5ba75b84d87f187..9329dc5ffc9dd0faa36b8ff6a8373387bc2678c7 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -143,7 +143,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}},\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -197,7 +197,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { """ core_ops_infos_registry = """ - {\"get_final_state_core_ops_args_info\", + ,{\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 42a3a13e5f70aef673e17521bf2fc57ed3869550..41e57ef1a15b0181c23b8e3f4b1bba12218a24f7 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -14,10 +14,10 @@ #pragma once // framework deps -#include "paddle/fluid/framework/pten_utils.h" +#include 
"paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -// pten deps +// Phi deps #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/api_declare.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -31,7 +31,7 @@ * provide variable in * paddle::framework::ExecutionContext to support it. We should remove this as * soon as we finish our latest - * Pten Lib, and use paddle::experimental::Tensor instead. + * Phi Lib, and use paddle::experimental::Tensor instead. * * Note: Keep this class as clean as possible. * This class should only support method declared in diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 27c376b4c80c6b4256d3e34ae98f39545551e19a..35416281f188892ec11413a19abad9b3e5c29e76 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -210,22 +210,22 @@ const std::vector>& GradNodeBase::GetEdges() const { return adj_edges_; } -void GradNodeBase::RegisterGradientHook( - size_t slot_id, size_t rank, - const std::function& hook) { - gradient_hooks_.emplace_back(std::make_tuple(slot_id, rank, hook)); +int64_t GradNodeBase::RegisterGradientHook( + size_t slot_id, size_t rank, std::shared_ptr&& hook) { + gradient_hooks_.emplace(next_hook_id_, + std::make_tuple(slot_id, rank, std::move(hook))); + return next_hook_id_++; } std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { std::vector> outs(tensors.size()); - for (auto& tuple : gradient_hooks_) { - size_t slot_id = std::get<0>(tuple); - size_t rank = std::get<1>(tuple); - std::function& hook = std::get<2>(tuple); + for (auto& hook_pair : gradient_hooks_) { + size_t slot_id = std::get<0>(hook_pair.second); + size_t rank = std::get<1>(hook_pair.second); + + auto hook = std::get<2>(hook_pair.second); PADDLE_ENFORCE(slot_id < tensors.size(), paddle::platform::errors::Fatal( @@ -242,12 +242,11 @@ GradNodeBase::ApplyGradientHooks( slot_out.resize(tensors[slot_id].size()); paddle::experimental::Tensor& out = slot_out[rank]; if (!out.defined() || !out.initialized()) { - VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name(); - out = hook(tensors[slot_id][rank]); + out = (*hook)(tensors[slot_id][rank]); } else { // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook - out = hook(out); + out = (*hook)(out); } } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index f699f9ab28e2d37c893e7a4fdec9acfa6c5a280f..eeac1cca4acf33190ce30613e4a86e99a95b651b 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { @@ -135,14 +136,24 @@ class GradNodeBase { /** * Register GradientHook * **/ - void RegisterGradientHook(size_t slot_id, size_t rank, - const std::function& hook); + int64_t RegisterGradientHook(size_t slot_id, size_t rank, + std::shared_ptr&& hook); + + /** + * Remove GradientHook + * **/ + bool RemoveGradientHook(const int64_t& hook_id) { + auto remove_cnt = gradient_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } /** * Apply GradientHook * **/ - inline bool GradientHooksRegistered() { return gradient_hooks_.size() != 0; } + inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); } 
std::vector> ApplyGradientHooks( const std::vector>& tensors); @@ -166,12 +177,14 @@ class GradNodeBase { // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward - // Each entry consists one pair of - std::vector>> + // Each entry consists one pair of + // >> + std::map>> gradient_hooks_; + + int64_t next_hook_id_{0}; }; class Edge { diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h new file mode 100644 index 0000000000000000000000000000000000000000..097150cf5ed59d0fdb9dda49e03eb75e6f1b4207 --- /dev/null +++ b/paddle/fluid/eager/hooks.h @@ -0,0 +1,63 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/phi/api/include/tensor.h" +namespace egr { + +class TensorHook { + public: + virtual ~TensorHook() = default; + virtual paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) = 0; +}; + +class TensorVoidHook { + public: + virtual ~TensorVoidHook() = default; + virtual void operator()() = 0; +}; + +class CppTensorHook : public TensorHook { + public: + explicit CppTensorHook(std::function&& fn) + : fn_(std::move(fn)) {} + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + return fn_(var); + } + + private: + std::function + fn_; +}; + +class CppTensorVoidHook : public TensorVoidHook { + public: + explicit CppTensorVoidHook(std::function&& fn) : fn_(std::move(fn)) {} + + void operator()() override { return fn_(); } + + private: + std::function fn_; +}; +} // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 880bd2684102710d9d432c7186d007f7e155badd..28682ab0fe094df6d27eb27e9118e6576685c95a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" @@ -116,7 +117,8 @@ TEST(AccumulationNode, Tensor) { VLOG(6) << "Running Reduce Hook"; }; - node->RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook( + std::make_shared(reduce_hook_1)); // operator() paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; @@ -141,7 +143,8 @@ TEST(AccumulationNode, Tensor) { ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node->RegisterReduceHook(reduce_hook_2); + node->RegisterReduceHook( + std::make_shared(reduce_hook_2)); node->ApplyReduceHooks(); // Check ApplyReduceHooks result diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc 
b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index aee6ee7488671930664e2accdea89a7d872c9583..e3db309c4016a512c5379fb352beb4af690a271e 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -32,7 +33,7 @@ TEST(GradNodeInfo, GradSlotMeta) { CHECK_EQ(grad_slot.Size(), 2); } -TEST(GradNodeInfo, GradNodeBase) { +void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Construct Grad Node"; auto grad_test_node0 = std::make_shared( /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); @@ -112,13 +113,25 @@ TEST(GradNodeInfo, GradNodeBase) { VLOG(6) << "Running Gradient Hook"; return res; }; - grad_test_node0->RegisterGradientHook(0, 0, gradient_hook); - // 5 + 6 + int64_t hook_id = grad_test_node0->RegisterGradientHook( + 0, 0, std::make_shared(gradient_hook)); + + if (is_remove_gradient_hook) { + // Remove GradientHook + grad_test_node0->RemoveGradientHook(hook_id); + } + + // Check results auto grad_hook_res = grad_test_node0->ApplyGradientHooks(grads); CHECK_EQ( std::dynamic_pointer_cast(grad_hook_res[0][0].impl()) ->data()[0], - 11.0); + is_remove_gradient_hook ? 5.0 : 11.0); +} + +TEST(GradNodeInfo, GradNodeBase) { + TestGradNodeBase(true); + TestGradNodeBase(false); } TEST(GradNodeInfo, Edge) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 752fd7812847c442f83d150eafa331360dfa8693..5a7bafb2fe37051c0ad054c130d77dd6e05319d2 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -27,6 +27,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -221,10 +222,6 @@ TEST(FwdBwdJoint, GradientHook) { phi::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); egr_utils_api::RetainGradForTensor(tensor); - std::function - hook = &hook_function; - // 3. 
Run Forward // Run Forward Node 0 float scale0 = 2.0; @@ -232,24 +229,27 @@ TEST(FwdBwdJoint, GradientHook) { paddle::experimental::Tensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out0); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out0, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; paddle::experimental::Tensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out1); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out1, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; paddle::experimental::Tensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out2); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out2, std::make_shared(hook_function)); // hook: +5 // 4. Run Backward std::vector outs = {out1, out2}; diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index fbc71168fe41697aa3175f1541350852a62a3220..9cda961741f55e9b4b7fc8dac61fe4a7c96567cf 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -28,6 +28,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -83,9 +84,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -96,7 +94,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 egr_utils_api::RetainGradForTensor( @@ -107,9 +106,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); @@ -126,7 +122,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } @@ -161,9 +158,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -175,16 +169,14 @@ TEST(RetainGrad, HookAfterRetainGrad) { auto_grad_meta)); 
egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); } // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto acc_node_ptr = @@ -199,7 +191,8 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); } RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index dbcfe704dbe1c31849b338dc4c1b9ea56e6ad667..15b2a62dca751859882e82d46acaa46f27c2c518 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -24,6 +24,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" namespace egr { @@ -54,7 +55,7 @@ paddle::experimental::Tensor hook_function( return ret; } -TEST(Hook_intermidiate, Sigmoid) { +void test_sigmoid(bool is_remove_gradient_hook) { // Prepare Device Contexts VLOG(6) << "Init Env"; eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -67,11 +68,6 @@ TEST(Hook_intermidiate, Sigmoid) { ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 0.0, true); - VLOG(6) << "Make Hook function"; - std::function - hook = &hook_function; - VLOG(6) << "Make ReduceHook function"; auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(tensor.impl()) @@ -85,10 +81,12 @@ TEST(Hook_intermidiate, Sigmoid) { egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Register GradientHook for Tensor"; - egr_utils_api::RegisterGradientHookForTensor(tensor, hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + tensor, std::make_shared(hook_function)); VLOG(6) << "Register ReduceHook for Tensor"; - egr_utils_api::RegisterReduceHookForTensor(tensor, reduce_hook); + egr_utils_api::RegisterReduceHookForTensor( + tensor, std::make_shared(reduce_hook)); VLOG(6) << "Runing Forward"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); @@ -98,11 +96,17 @@ TEST(Hook_intermidiate, Sigmoid) { std::vector target_tensors = {output_tensor}; + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(tensor); + grad_node_tmp->RemoveGradientHook(hook_id); + } + VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - eager_test::CompareGradTensorWithValue(tensor, 0.25 + 3); + eager_test::CompareGradTensorWithValue( + tensor, is_remove_gradient_hook ? 
0.25 : 0.25 + 3.0); VLOG(6) << "Checking ReduceHook results"; for (int i = 0; i < tensor.numel(); i++) { @@ -113,7 +117,7 @@ TEST(Hook_intermidiate, Sigmoid) { VLOG(6) << "After Tests"; } -TEST(Hook_intermidiate, ElementwiseAdd) { +void test_elementwiseAdd(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -132,11 +136,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - - auto reduce_hook = [&](void) -> void { + auto reduce_hook = [&]() -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); for (int i = 0; i < Y.numel(); i++) { @@ -145,18 +145,26 @@ TEST(Hook_intermidiate, ElementwiseAdd) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); eager_test::CompareTensorWithValue(output_tensor, 5); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); - eager_test::CompareGradTensorWithValue(Y, 4.0); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 1.0 : 1.0 + 3.0); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -166,7 +174,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { } } -TEST(Hook_intermidiate, Matmul_v2) { +void test_matmul(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -185,10 +193,6 @@ TEST(Hook_intermidiate, Matmul_v2) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); @@ -198,19 +202,27 @@ TEST(Hook_intermidiate, Matmul_v2) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); eager_test::CompareTensorWithValue(output_tensor, 96); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); - eager_test::CompareGradTensorWithValue(Y, 3.0 * 4 + 3); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 
3.0 * 4 : 3.0 * 4 + 3); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -219,6 +231,22 @@ TEST(Hook_intermidiate, Matmul_v2) { static_cast(100.0f)); } } + +TEST(Hook_intermidiate, Sigmoid) { + // True or false represents whether to call RemoveGradientHook + test_sigmoid(true); + test_sigmoid(false); +} + +TEST(Hook_intermidiate, ElementwiseAdd) { + test_elementwiseAdd(true); + test_elementwiseAdd(false); +} + +TEST(Hook_intermidiate, Matmul_v2) { + test_matmul(true); + test_matmul(false); +} } // namespace egr USE_OP(sigmoid); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 7464ad74135853a6d5f6b0f6eec3b950f527a599..a7e5931f1f9bc66006fb1a37836be1eda371953e 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -23,7 +23,7 @@ #include "paddle/phi/core/tensor_meta.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true, diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7d527e24a0079e8e8fc9f591ee35131c25a38a8b..14aecb5fd43c49ece1f79cb9c8e2b70e9d07df07 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -193,19 +193,19 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) IF(WITH_XPU) -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info xpu_op_list) +cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list) ELSE() -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info) +cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info) ENDIF() IF(WITH_XPU) cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils op_utils) + phi phi_utils kernel_factory infershape_utils op_utils) ELSE() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils op_utils) + phi phi_utils kernel_factory infershape_utils op_utils) ENDIF() cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -412,7 +412,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) cc_library(generator SRCS generator.cc DEPS enforce place) -cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) +cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place phi var_type_traits phi phi_api_utils op_info 
shape_inference) cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor) # Get the current working branch @@ -436,8 +436,8 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) -cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry pten_custom_kernel pten_tensor_raw) +cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) @@ -450,7 +450,7 @@ if(WITH_TESTING AND TEST selected_rows_utils_test) endif() cc_test(scope_guard_test SRCS scope_guard_test.cc) -cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils) +cc_test(phi_utils_test SRCS phi_utils_test.cc DEPS phi_utils) if(WITH_GPU OR WITH_ROCM) cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 81b6917587df9282d3ff59180e6fc079379cef60..ae3d8379bdbf779e2cf82d27c18997f82cb92095 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { diff --git a/paddle/fluid/framework/convert_utils.cc b/paddle/fluid/framework/convert_utils.cc index 23cf4324086bd48f7a2a429bde26f7303e8d34b3..df5cc6d82042c262467b35f6a7cbe097a4ad7776 100644 --- a/paddle/fluid/framework/convert_utils.cc +++ b/paddle/fluid/framework/convert_utils.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -paddle::experimental::DataType TransToPtenDataType( +paddle::experimental::DataType TransToPhiDataType( const paddle::framework::proto::VarType::Type& dtype) { // Set the order of case branches according to the frequency with // the data type is used diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index c94b5b2311c5202832e5fe00c702e14c56ada9b9..da2af86c77c477c3c70b220b47bc073b47645a5d 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -32,7 +32,7 @@ namespace framework { using DataType = paddle::experimental::DataType; using DataLayout = paddle::experimental::DataLayout; -DataType TransToPtenDataType( +DataType TransToPhiDataType( const paddle::framework::proto::VarType::Type& dtype); paddle::framework::proto::VarType::Type TransToProtoVarType( diff --git a/paddle/fluid/framework/convert_utils_test.cc b/paddle/fluid/framework/convert_utils_test.cc index 51b431f4b4a8a080f312f7d8bfdf12c1cdc44e4b..140806dfd7c5e1ae2746f3d116f418fea16fa1f3 100644 --- a/paddle/fluid/framework/convert_utils_test.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -43,35 +43,35 @@ TEST(ConvertUtils, DataType) { CHECK(paddle::framework::TransToProtoVarType(paddle::DataType::FLOAT16) == paddle::framework::proto::VarType::FP16); // proto -> enum - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP64) == paddle::DataType::FLOAT64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP32) == paddle::DataType::FLOAT32); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT64) == paddle::DataType::INT64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT32) == paddle::DataType::INT32); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT8) == paddle::DataType::INT8); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::UINT8) == paddle::DataType::UINT8); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT16) == paddle::DataType::INT16); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::COMPLEX64) == paddle::DataType::COMPLEX64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::COMPLEX128) == paddle::DataType::COMPLEX128); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP16) == paddle::DataType::FLOAT16); } diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 597265bb2473fd14108b4fa11e7ae93957c4268b..b9e3bee25f6b5377dde7b525138643964fd8366a 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -30,7 +30,7 @@ limitations under 
the License. */ #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" @@ -779,13 +779,13 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, for (size_t i = 0; i < ctx->InputSize(in_name); ++i) { auto dtype = ctx->GetInputDataType(in_name, i); vec_custom_dtype.emplace_back( - paddle::framework::TransToPtenDataType(dtype)); + paddle::framework::TransToPhiDataType(dtype)); } vec_input_dtypes.emplace_back(vec_custom_dtype); } else { auto dtype = ctx->GetInputDataType(in_name); input_dtypes.emplace_back( - paddle::framework::TransToPtenDataType(dtype)); + paddle::framework::TransToPhiDataType(dtype)); } } diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index cf9e3de6c1a58a277e4508442c39a882ffa506b2..4757eb60f4361cffd9354afd4a8bf4bf99e86eb3 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 7152004b63de6deab22988a79917b536a0623c81..15cf30c1cf352324b57b8ca7bfcdf9d2d2640aea 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -28,7 +28,7 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::TransToPtenDataType(dtype)); + tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); // test fp16 tensor EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()), @@ -51,7 +51,7 @@ TEST(DataType, bfloat16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::TransToPtenDataType(dtype)); + tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); // test bf16 tensor EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()), diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 1cf69a1a3d652a49226447c5559613378bd3ee17..1b2b24762894c0d72e75f0c4d20531e21f146cfd 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -231,6 +231,8 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); this->pool_->enqueue([=] { std::deque op_queue; op_queue.push_front(op); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 47ab1e0fc030a6897162a99e8eb4da5e34541c79..06019372a7323b3c61c067638da19b847eba9031 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -34,7 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 8c3c1e015262b70efb575b0d3a5ebcd662459170..84dcdad78298acbd74b2f2d23e81ceba4bd71a72 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -161,7 +161,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->set_lod(lod); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(ToVarType(req_var.data_type()))); + place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), @@ -202,7 +202,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->set_lod(lod); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(ToVarType(req_var.data_type()))); + place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); #ifdef PADDLE_WITH_XPU memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 8aafd3459ed1a3d1673e482016c550e69c74a6cd..b6759bb2e6fe6c5a3688f3d72e84aabf0c1d2717 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -38,7 +38,7 @@ void SetMicroId(paddle::framework::Scope* scope, std::vector dims{1}; tensor->Resize(phi::make_ddim(dims)); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(framework::proto::VarType::FP32)); + place, framework::TransToPhiDataType(framework::proto::VarType::FP32)); if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA std::vector temp; diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 4bec1baeaaee94942be33a86ff2165dd98da5818..e14b91d935d05c12442f3d0205c1e97df9697ec3 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" @@ -144,7 +144,7 @@ class CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return paddle::framework::TransToPtenDataType(var->GetDataType()); + return paddle::framework::TransToPhiDataType(var->GetDataType()); } } @@ -341,24 +341,37 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } if (infershape_inputs.size() != 1) { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarArrayFromVarList(vars))); + std::move(experimental::MakePhiScalarArrayFromVarList(vars))); } else { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarArrayFromVar(*vars[0]))); + std::move(experimental::MakePhiScalarArrayFromVar(*vars[0]))); } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 1; + int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); for (size_t i = 0; i < infershape_inputs.size(); i++) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); + + if (vars.size() == 1) { + num_ele = 1; + const auto& tensor_dims = vars[0]->GetShape(); for (size_t i = 0; i < tensor_dims.size(); ++i) { num_ele *= tensor_dims[i]; } + } else { + for (auto& var : vars) { + const auto& tensor_dims = var->GetShape(); + PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, + platform::errors::InvalidArgument( + "The shape is constructed by multi-tensor, " + "every tensor's dims should be 1. 
But your " + "shape has tensor that dims is %s.", + tensor_dims.size())); + num_ele += tensor_dims[0]; + } } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -406,7 +419,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarFromVar(*var))); + std::move(experimental::MakePhiScalarFromVar(*var))); } else { phi::Scalar tensor_scalar(-1); tensor_scalar.SetFromTensor(true); @@ -468,7 +481,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, BOOST_GET_CONST(std::vector, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPtenDataType( + auto data_type = paddle::framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); infer_meta_context.EmplaceBackAttr(data_type); diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index ec5d48b3093f7c73bffa0196ccd75e11a89baeac..26ee02ff1812d2e73d0be3bed762d1a4ae4ac6c7 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -276,13 +276,13 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU( bool support_gpu = false; auto &kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = - kernel_factory.SelectKernelMap(phi::TransToPtenKernelName(op_type)); + kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); bool has_op_kernel = kernel_key_map.size() > 0 ? 
true : false; for (auto &kernel : kernel_key_map) { - if (platform::is_gpu_place(phi::TransToPtenPlace(kernel.first.backend()))) { + if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { support_gpu = true; } else if (platform::is_cpu_place( - phi::TransToPtenPlace(kernel.first.backend()))) { + phi::TransToPhiPlace(kernel.first.backend()))) { support_cpu = true; } } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index dafcc9c4e16a3ee43df17c1c0d650288c31b18b8..e9850483ebe913e298dc7501ed4155fb0dfc2879 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -96,7 +96,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void MainTest(bool convWithExistingBias) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 3a78c229bd8fa83ff4c4d96ff270f20f131ab52b..889417b78c8641060b8ad89219749d8400558c6a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -126,7 +126,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index e00bb84e35c09eb987b2470c041545cf7f53e4ea..0506bfaf447ac68368d7d8f2a87014a6234c444c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -526,7 +526,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void PrepareGraph(std::unique_ptr* graph, const ProgramDesc& prog) { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index ea335e9bd63c624310df2f092b13e30a9458bb93..0a95444f852dd0abdd150d04dc7536e26151c218 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index a9e0b9c98b46f39b98a6bdce1fc12bbc3321ef00..56f9e6842373b3eba7d2d71b84adbf17ad291254 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ 
b/paddle/fluid/framework/lod_tensor.cc @@ -447,7 +447,7 @@ void MergeLoDTensor(LoDTensor *target, target->set_layout(new_layout); target->set_lod(new_lod); target->mutable_data(dst_place, - paddle::framework::TransToPtenDataType(new_type)); + paddle::framework::TransToPhiDataType(new_type)); int begin = 0; for (auto *src : lod_tensors) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 766a3b9e495d521db3d628d170fb13fa32bdebb2..878b845211ca1ae9e92f43fcc6ac82da366264d4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -389,7 +389,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { auto op_with_kernel = dynamic_cast(op); { platform::RecordEvent infershape_event( - "InferShape", platform::TracerEventType::OperatorInner, 1, + "infer_shape", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); // If it is OperatorBase, InferShape do nothing. if (op_with_kernel != nullptr) @@ -411,23 +411,23 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } { platform::RecordEvent compute_event( - "Compute", platform::TracerEventType::OperatorInner, 1, + "compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { - // fit for pten - if (instr_node.PtenKernel() && instr_node.PtenKernel()->IsValid()) { - VLOG(4) << "Run pten kernel: " << op->Type(); + // fit for phi + if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) { + VLOG(4) << "Run phi kernel: " << op->Type(); VLOG(4) << instr_node.InnerRuntimeContext().get() << " " << &instr_node.DeviceContext(); phi::KernelContext pt_kernel_context; - op_with_kernel->BuildPtenKernelContext( + op_with_kernel->BuildPhiKernelContext( *instr_node.InnerRuntimeContext().get(), const_cast(&instr_node.DeviceContext()), &pt_kernel_context); - (*instr_node.PtenKernel())(&pt_kernel_context); + (*instr_node.PhiKernel())(&pt_kernel_context); } else { instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); @@ -561,7 +561,8 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); - platform::RecordEvent instruction_event(op->Type().c_str()); + platform::RecordEvent instruction_event( + op->Type(), platform::TracerEventType::Operator, 1); interpreter::WaitEvent(instr_node, place_); try { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 0767dde4392b89d57539ad697f5b64d2090b0fcd..d595af58257d4f6e0f6bd1fd009ab78e181f96f7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -407,14 +407,14 @@ void build_op_func_list(const platform::Place& place, auto exec_ctx = ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context); - auto run_pten_kernel = false; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel( + auto run_phi_kernel = false; + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( op_with_kernel->Type())) { - auto pt_kernel_key = op_with_kernel->ChoosePtenKernel(exec_ctx); - auto pt_kernel_name = op_with_kernel->PtenKernelSignature()->name; + auto pt_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx); + auto 
pt_kernel_name = op_with_kernel->PhiKernelSignature()->name; - if (op_with_kernel->PtenKernel()->IsValid()) { - run_pten_kernel = true; + if (op_with_kernel->PhiKernel()->IsValid()) { + run_phi_kernel = true; } else { auto kernels_iter = all_op_kernels.find(op_with_kernel->Type()); if (kernels_iter == all_op_kernels.end() || @@ -422,26 +422,26 @@ void build_op_func_list(const platform::Place& place, kernels_iter->second.end()) { auto pt_cpu_kernel_key = FallBackToCpu( expected_kernel_key, pt_kernel_key, *op_with_kernel); - op_with_kernel->ResetPtenKernel( + op_with_kernel->ResetPhiKernel( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_cpu_kernel_key))); - if (op_with_kernel->PtenKernel()->IsValid()) { + if (op_with_kernel->PhiKernel()->IsValid()) { VLOG(6) << "Static mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key - << " | kernel: " << *(op_with_kernel->PtenKernel()); - run_pten_kernel = true; + << " | kernel: " << *(op_with_kernel->PhiKernel()); + run_phi_kernel = true; } } } } VLOG(3) << op_with_kernel->Type() << " : expected_kernel_key : " << expected_kernel_key; - if (run_pten_kernel) { + if (run_phi_kernel) { phi::KernelContext pt_kernel_context; - op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx, - &pt_kernel_context); - op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); + op_with_kernel->BuildPhiKernelContext(runtime_context, dev_ctx, + &pt_kernel_context); + op_func_node.pt_kernel_ = op_with_kernel->PhiKernel(); (*op_func_node.pt_kernel_)(&pt_kernel_context); } else { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 1fbe4500ac6dff261cc38e33ad90bfd92b83ad39..35bac4393170331486298a29f1b6be26065ad864 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -688,9 +688,7 @@ OpKernelComputeFunc Instruction::KernelFunc() const { return op_func_node_.kernel_func_; } -phi::Kernel* Instruction::PtenKernel() const { - return op_func_node_.pt_kernel_; -} +phi::Kernel* Instruction::PhiKernel() const { return op_func_node_.pt_kernel_; } OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 93b9aee4f32cbfa88c0a79d4018b3a2ca03cf035..dc34bd2c69411837b6130b87dba1753687cf82f8 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -300,7 +300,7 @@ struct OpFuncNode { OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned - // fit for pten kernel + // fit for phi kernel phi::Kernel* pt_kernel_{nullptr}; // not owned OpFuncType type_; @@ -321,7 +321,7 @@ class Instruction { OpKernelComputeFunc KernelFunc() const; - phi::Kernel* PtenKernel() const; + phi::Kernel* PhiKernel() const; OpFuncType KernelType() const; diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h index a4a913cdff22db18e467670be9644ed90dca542e..21b2927b52eab653e20611e135a8c0f905057fcf 100644 --- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h +++ b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h @@ -44,7 +44,6 @@ class ThreadDataRegistry { template ::value>> void 
SetCurrentThreadData(const T& val) { - std::lock_guard lock(lock_); CurrentThreadData() = val; } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 07c5298c2f22377e277939e11af6fa6c142f24bc..596ffb9bfc0c4f624aeaf5874bdf18563d96d14c 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -8,6 +8,7 @@ #include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -61,6 +62,8 @@ class WorkQueueImpl : public WorkQueue { } void AddTask(std::function fn) override { + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); if (tracker_ != nullptr) { fn = [ task = std::move(fn), raii = CounterGuard(tracker_) @@ -156,6 +159,8 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() { } void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); assert(queue_idx < queues_.size()); if (queues_options_.at(queue_idx).track_task) { fn = [ diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 692ebf6f332f15be552a223cab89eabbf5c4a69b..d33791f70c4d2f759bcd4f6443a5a1f244673d4f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_call_stack.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" @@ -263,11 +263,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // in order to record different op type cost time // and different op name cost time,we set two event. 
platform::RecordEvent op_type_record_event( - Type().c_str(), platform::TracerEventType::Operator, 1); - auto op_name = platform::OpName(outputs_, Type()); - platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, 1, - platform::EventRole::kUniqueOp); + Type(), platform::TracerEventType::Operator, 1); + // auto op_name = platform::OpName(outputs_, Type()); + // platform::RecordEvent op_name_record_event( + // op_name, platform::TracerEventType::Operator, 1, + // platform::EventRole::kUniqueOp); RunImpl(scope, place); } @@ -616,9 +616,9 @@ bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = - kernel_factory.SelectKernelMap(phi::TransToPtenKernelName(op_type)); + kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { - if (platform::is_gpu_place(phi::TransToPtenPlace(kernel.first.backend()))) { + if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } } @@ -1186,10 +1186,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // phase phi::KernelKey pt_kernel_key; std::string pt_kernel_name; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(type_)) { if (pt_kernel_signature_ == nullptr || pt_kernel_ == nullptr) { pt_kernel_signature_.reset( - new KernelSignature(std::move(GetExpectedPtenKernelArgs(exe_ctx)))); + new KernelSignature(std::move(GetExpectedPhiKernelArgs(exe_ctx)))); VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( @@ -1197,17 +1197,17 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); pt_kernel_name = pt_kernel_signature_->name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get()); + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_kernel_key))); if (pt_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePtenKernel - kernel name: " + VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << *pt_kernel_; } else { - VLOG(6) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } @@ -1222,7 +1222,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, && !is_xpu_unsupport #endif ) { - run_pten_kernel_ = true; + run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); @@ -1244,12 +1244,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << *pt_kernel_; - run_pten_kernel_ = true; + run_phi_kernel_ = true; } } } } - if (!run_pten_kernel_) { + if (!run_phi_kernel_) { if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { ChooseKernel(exe_ctx); dev_ctx = pool.Get(kernel_type_->place_); @@ -1290,13 +1290,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - if (run_pten_kernel_) { + if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform 
before building KernelContext // TODO(zhiqiu): support TransferInplaceVarsBack - PreparePtenData(exec_scope, *pt_kernel_, *pt_kernel_signature_, - runtime_ctx); - BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + PreparePhiData(exec_scope, *pt_kernel_, *pt_kernel_signature_, + runtime_ctx); + BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); (*pt_kernel_)(&pt_kernel_context); } else { (*kernel_func_)( @@ -1388,26 +1388,26 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( return expected_kernel_key; } -phi::KernelKey OperatorWithKernel::ChoosePtenKernel( +phi::KernelKey OperatorWithKernel::ChoosePhiKernel( const ExecutionContext& ctx) const { pt_kernel_signature_.reset( - new KernelSignature(std::move(GetExpectedPtenKernelArgs(ctx)))); + new KernelSignature(std::move(GetExpectedPhiKernelArgs(ctx)))); VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( new OpKernelType(std::move(InnerGetExpectedKernelType(ctx)))); auto pt_kernel_name = pt_kernel_signature_->name; - auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get()); + auto pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset(new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_kernel_key))); if (pt_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePtenKernel - kernel name: " << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << *pt_kernel_; } else { - VLOG(6) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } return pt_kernel_key; @@ -1918,7 +1918,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( tensor.layout()); } -KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( +KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const { InitDefaultKernelSignatureMap(); ExecutionArgumentMappingContext arg_mapping_ctx(ctx); @@ -1926,7 +1926,7 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( arg_mapping_ctx); } -Scope* OperatorWithKernel::PreparePtenData( +Scope* OperatorWithKernel::PreparePhiData( const Scope& scope, const phi::Kernel& pt_kernel, const KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { auto& input_names = std::get<0>(pt_kernel_signature.args); @@ -1981,12 +1981,12 @@ Scope* OperatorWithKernel::PreparePtenData( if (in_def.backend == phi::Backend::ALL_BACKEND) { continue; } - auto expected_place = phi::TransToPtenPlace(in_def.backend); + auto expected_place = phi::TransToPhiPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; } - VLOG(3) << "PTen Transform Variable " << input_names[i] << " from " + VLOG(3) << "phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; if (!new_scope) { @@ -2007,7 +2007,7 @@ Scope* OperatorWithKernel::PreparePtenData( return new_scope; } -void OperatorWithKernel::BuildPtenKernelContext( +void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2111,7 +2111,7 @@ void OperatorWithKernel::BuildPtenKernelContext( experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, output_defs.at(i)); SetAllocationForOutputTenosr( - tensor_out, 
phi::TransToPtenPlace(output_defs.at(i).backend)); + tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -2145,10 +2145,10 @@ void OperatorWithKernel::BuildPtenKernelContext( auto& ins_vector = ctx.inputs.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVar(*ins_vector.front()))); + experimental::MakePhiScalarArrayFromVar(*ins_vector.front()))); } else { // ShapeTensorList pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVarList(ins_vector))); + experimental::MakePhiScalarArrayFromVarList(ins_vector))); } } } else if (attr_defs[i].type_index == @@ -2178,8 +2178,8 @@ void OperatorWithKernel::BuildPtenKernelContext( } } else { auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarFromVar(*ins_vector.front()))); + pt_kernel_context->EmplaceBackAttr( + std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } } else { @@ -2198,7 +2198,7 @@ void OperatorWithKernel::BuildPtenKernelContext( pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPtenDataType( + auto data_type = paddle::framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); pt_kernel_context->EmplaceBackAttr(data_type); @@ -2206,7 +2206,7 @@ void OperatorWithKernel::BuildPtenKernelContext( std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Pten_Kernel args. + // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ff9cb8a287a26210cb585c1c58dcb20e860af880..16718a316513e3574e9a7eb14ed50106c8b0dcb6 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" @@ -423,7 +423,7 @@ class ExecutionContext { "size(%d).", allocation_ptr->size(), phi::product(dim) * sizeof(T))); - paddle::framework::Tensor temp_tensor(framework::TransToPtenDataType( + paddle::framework::Tensor temp_tensor(framework::TransToPhiDataType( framework::ToDataType(std::type_index(typeid(T))))); temp_tensor.Resize(dim); temp_tensor.ResetHolder(std::move(shared_allocation)); @@ -538,14 +538,14 @@ class OperatorWithKernel : public OperatorBase { } bool SupportGPU() const override { - auto pten_kernels = phi::KernelFactory::Instance().SelectKernelMap( - phi::TransToPtenKernelName(type_)); - auto has_pten_kernel = - std::any_of(pten_kernels.begin(), pten_kernels.end(), + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); - if (has_pten_kernel) { + if (has_phi_kernel) { return true; } else { auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); @@ -558,7 +558,7 @@ class OperatorWithKernel : public OperatorBase { } bool SupportNPU() const override { - // TODO(zhiqiu): support pten if needed? + // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { @@ -566,7 +566,7 @@ class OperatorWithKernel : public OperatorBase { }); } bool SupportMLU() const override { - // TODO(zhiqiu): support pten if needed? + // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { @@ -603,39 +603,39 @@ class OperatorWithKernel : public OperatorBase { return kernel_type_->place_; } - /* member functions for adapting to pten lib */ + /* member functions for adapting to phi lib */ /** In the Tensor calculation library, the new Kernel adopts a clearer and * more streamlined design. The arguments of the Kernel and the input and * output arguments registered in the original OpMaker do not match in some * cases, so we use map to record the arguments required by the kernel. * When selecting Kernel during Op execution, select the arguments of the - * original Op according to the GetExpectedPtenKernelArgs returned arguments. + * original Op according to the GetExpectedPhiKernelArgs returned arguments. */ - phi::KernelSignature GetExpectedPtenKernelArgs( + phi::KernelSignature GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const; - /* member functions for adapting to pten lib */ - phi::KernelKey ChoosePtenKernel(const ExecutionContext& ctx) const; + /* member functions for adapting to phi lib */ + phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const; /** - * Transfer data place for pten kernel + * Transfer data place for phi kernel * Is this really needed? 
*/ - Scope* PreparePtenData(const Scope& scope, const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const; + Scope* PreparePhiData(const Scope& scope, const phi::Kernel& pt_kernel, + const phi::KernelSignature& pt_kernel_signature, + RuntimeContext* ctx) const; - void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx, - phi::KernelContext* pt_kernel_context) const; + void BuildPhiKernelContext(const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, + phi::KernelContext* pt_kernel_context) const; - phi::KernelSignature* PtenKernelSignature() const { + phi::KernelSignature* PhiKernelSignature() const { return pt_kernel_signature_.get(); } - phi::Kernel* PtenKernel() const { return pt_kernel_.get(); } + phi::Kernel* PhiKernel() const { return pt_kernel_.get(); } - void ResetPtenKernel(phi::Kernel* kernel) const { + void ResetPhiKernel(phi::Kernel* kernel) const { return pt_kernel_.reset(kernel); } @@ -692,9 +692,9 @@ class OperatorWithKernel : public OperatorBase { mutable std::mutex cache_update_mutex_; mutable bool enable_cache_transfer_scope_ = false; // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, + // new phi kernel, if there is a better design in the future, // we may polish the implementation here - mutable bool run_pten_kernel_ = false; + mutable bool run_phi_kernel_ = false; mutable bool run_kp_kernel = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index d55950064a4a2363222929ea8d4f863575dcd6da..6e55727c8bf67c18a0b27454eaa3c3f48ee9db89 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -44,11 +44,6 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { - -namespace ir { -class MemOptVarInfo; -} // namespace ir - namespace paddle2cinn { using framework::ir::Graph; @@ -398,9 +393,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, kNoNeedBufferFeeds, no_need_buffer_feeds.release()); // initialize empty map for kMemOptVarInfoFromMainGraph attribute, // it will be filled on the share_mem_opt_info_to_subgraph pass - subgraph->GetOrInit>>( - kMemOptVarInfoFromMainGraph); + subgraph->GetOrInit(kMemOptVarInfoFromMainGraph); return subgraph; } diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h index 8cb920831cc543a073652051c1ba234e974179c3..a902eacde820fac8556c42b5b4ccbb6342c7bba8 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -18,6 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +namespace ir { +class MemOptVarInfo; +} // namespace ir + namespace paddle2cinn { constexpr char kCinnLaunchOp[] = "cinn_launch"; @@ -27,6 +31,9 @@ constexpr char kInternalVars[] = "InternalVars"; constexpr char kOutputVars[] = "OutputVars"; constexpr char kMemOptVarInfoFromMainGraph[] = "mem_opt_var_info_from_main_graph"; +using Name2VarInfoMap = + std::unordered_map>; // A pass named BuildCinnPass, the function of this pass is: // diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 919fc60d4cb61b6079965e3c8ab7d43ca9a2b211..bf9d1baaf394f05d125563311dd2047383373834 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -255,7 +255,9 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), std::unordered_set({v0, v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); + ASSERT_EQ(std::unordered_set(cinn_op->outputs.begin(), + cinn_op->outputs.end()), + std::unordered_set({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 716cd85e7117af4680f6cad908810ebdf6f5f973..706815185a1b5b53d1bb8e26274206abc126cfd5 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -248,10 +248,10 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; - compiled_obj->launch_context = - std::make_unique( - compiled_obj->paddle2cinn_varmap, compiled_obj->scope); compiled_obj->cached_index = compiled_num; + compiled_obj->launch_context = + std::make_unique(graph, + *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 09bca4a735461914e203cd479f45d000985a37b4..c0e1ca8f0d123379f3363afc45dd083b4a5dc951 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -209,7 +209,7 @@ class CinnGraphSymbolizationTest : public ::testing::Test { tensor.Resize(dims); tensor.mutable_data( platform::CPUPlace(), - framework::TransToPtenDataType(framework::proto::VarType::FP32)); + framework::TransToPhiDataType(framework::proto::VarType::FP32)); return tensor; }; #define FillFeedList(Name) feed_targets[#Name] = create_tensor(); diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/phi_utils.cc similarity index 84% rename from paddle/fluid/framework/pten_utils.cc rename to paddle/fluid/framework/phi_utils.cc index 0ecc04dbd6b8d36a3540178d11d3e8def7449a7f..355291beb60f949b52b681592d42b7da4e80186b 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" @@ -57,17 +57,16 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { paddle::SmallVector attr_names_; }; -OpKernelType TransPtenKernelKeyToOpKernelType( - const phi::KernelKey& kernel_key) { +OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { proto::VarType::Type data_type = paddle::framework::TransToProtoVarType(kernel_key.dtype()); // no need to set current device id here - platform::Place place = phi::TransToPtenPlace(kernel_key.backend(), false); + platform::Place place = phi::TransToPhiPlace(kernel_key.backend(), false); DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; if (kernel_key.backend() == phi::Backend::MKLDNN) { library_type = LibraryType::kMKLDNN; - } else if (kernel_key.backend() == phi::Backend::CUDNN) { + } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; } else { // do nothing @@ -76,19 +75,19 @@ OpKernelType TransPtenKernelKeyToOpKernelType( return OpKernelType(data_type, place, data_layout, library_type); } -phi::KernelKey TransOpKernelTypeToPtenKernelKey( +phi::KernelKey TransOpKernelTypeToPhiKernelKey( const OpKernelType& kernel_type) { - phi::Backend backend = phi::TransToPtenBackend(kernel_type.place_); + phi::Backend backend = phi::TransToPhiBackend(kernel_type.place_); if (kernel_type.library_type_ == LibraryType::kMKLDNN) { backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { - backend = phi::Backend::CUDNN; + backend = phi::Backend::GPUDNN; } else { // do } paddle::experimental::DataLayout layout = kernel_type.data_layout_; paddle::experimental::DataType dtype = - paddle::framework::TransToPtenDataType(kernel_type.data_type_); + paddle::framework::TransToPhiDataType(kernel_type.data_type_); return phi::KernelKey(backend, layout, dtype); } @@ -98,8 +97,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(expected_kernel_key.place_) || paddle::platform::is_in_xpu_black_list(op.Type())) { - VLOG(3) << "pten missing XPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing XPU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -107,8 +106,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing NPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing NPU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -116,8 +115,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing MLU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing MLU kernel: " << op.Type() + << "phipected_kernel_key:" << 
expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -132,17 +131,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { auto& in = op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(6) << "Parse PtenKernel input: skip extra & quant input - " + VLOG(6) << "Parse PhiKernel input: skip extra & quant input - " << in_name; continue; } // If contains dispensable input, we should override the // OpArgumentMapping method self in phi/ops/compat dir if (in.has_dispensable() && in.dispensable()) { - VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name; + VLOG(6) << "Parse PhiKernel input: skip dispensable input - " << in_name; continue; } - VLOG(6) << "Parse PtenKernel input: " << in_name; + VLOG(6) << "Parse PhiKernel input: " << in_name; input_names_.emplace_back(in_name); } return input_names_; @@ -154,11 +153,11 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); if ((out.has_extra() && out.extra()) || (out.has_quant() && out.quant())) { - VLOG(6) << "Parse PtenKernel output: skip extra & quant output - " + VLOG(6) << "Parse PhiKernel output: skip extra & quant output - " << out_name; continue; } - VLOG(6) << "Parse PtenKernel output: " << out_name; + VLOG(6) << "Parse PhiKernel output: " << out_name; output_names_.emplace_back(out_name); } return output_names_; @@ -173,17 +172,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { attr_name == "op_role" || attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(6) << "Parse PtenKernel attribute: skip needless attr - " + VLOG(6) << "Parse PhiKernel attribute: skip needless attr - " << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(6) << "Parse PtenKernel attribute: skip extra & quant attr - " + VLOG(6) << "Parse PhiKernel attribute: skip extra & quant attr - " << attr_name; continue; } - VLOG(6) << "Parse PtenKernel attribute: " << attr_name; + VLOG(6) << "Parse PhiKernel attribute: " << attr_name; attr_names_.emplace_back(attr_name); } @@ -191,7 +190,7 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(phi::TransToPtenKernelName(op_proto_->type()), + return KernelSignature(phi::TransToPhiKernelName(op_proto_->type()), GetInputArgsNames(), GetAttrsArgsNames(), GetOutputArgsNames()); } @@ -203,7 +202,7 @@ void InitDefaultKernelSignatureMap() { for (const auto& pair : paddle::framework::OpInfoMap::Instance().map()) { const auto& op_type = pair.first; const auto* op_proto = pair.second.proto_; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type) && op_proto) { paddle::framework::KernelArgsNameMakerByOpProto maker(op_proto); VLOG(10) << "Register kernel signature for " << op_type; diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/phi_utils.h similarity index 87% rename from paddle/fluid/framework/pten_utils.h rename to paddle/fluid/framework/phi_utils.h index 1bcffbcc3143547eb1df0975c9e2163bfebed02e..1a1f79d82770058ae4010b7a3a3162280ceb1537 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -44,9 +44,8 @@ using 
KernelSignature = phi::KernelSignature; /* Kernel Key translate */ -OpKernelType TransPtenKernelKeyToOpKernelType(const phi::KernelKey& kernel_key); -phi::KernelKey TransOpKernelTypeToPtenKernelKey( - const OpKernelType& kernel_type); +OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key); +phi::KernelKey TransOpKernelTypeToPhiKernelKey(const OpKernelType& kernel_type); phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, const phi::KernelKey& kernel_key, const framework::OperatorBase& op); @@ -68,25 +67,25 @@ void SetAllocationForOutputTenosr(phi::TensorBase* tensor, // TODO(Wilber): support others device context. template -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = T; }; template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::GPUContext; }; #endif #ifdef PADDLE_WITH_XPU template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::XPUContext; }; #endif diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc similarity index 52% rename from paddle/fluid/framework/pten_utils_test.cc rename to paddle/fluid/framework/phi_utils_test.cc index 3c86372e6e7528908a51b83b611da53cd68cff79..cbcdf24c9f32b47f3337b4f176753328497d8c85 100644 --- a/paddle/fluid/framework/pten_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" -TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { +TEST(PhiUtils, TransPhiKernelKeyToOpKernelType) { phi::KernelKey kernel_key(phi::Backend::CPU, phi::DataLayout::NCHW, phi::DataType::FLOAT32); auto op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_)); @@ -33,7 +33,7 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { phi::KernelKey kernel_key_mkldnn(phi::Backend::MKLDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_mkldnn); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_mkldnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_)); @@ -42,10 +42,10 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { #endif #ifdef PADDLE_WITH_CUDA - phi::KernelKey kernel_key_cudnn(phi::Backend::CUDNN, phi::DataLayout::NCHW, + phi::KernelKey kernel_key_cudnn(phi::Backend::GPUDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = - 
paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_cudnn); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_cudnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_gpu_place(op_kernel_type.place_)); @@ -53,3 +53,38 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { paddle::framework::LibraryType::kCUDNN); #endif } + +TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { + paddle::framework::OpKernelType op_kernel_type( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kNCHW); + auto kernel_key = + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type); + ASSERT_EQ(kernel_key.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key.layout(), phi::DataLayout::NCHW); + ASSERT_EQ(kernel_key.backend(), phi::Backend::CPU); + +#ifdef PADDLE_WITH_MKLDNN + paddle::framework::OpKernelType op_kernel_type_mkldnn( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kMKLDNN, + paddle::framework::LibraryType::kMKLDNN); + auto kernel_key_mkldnn = + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); + ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); + ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::MKLDNN); +#endif + +#ifdef PADDLE_WITH_CUDA + paddle::framework::OpKernelType op_kernel_type_cudnn( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kNCHW, + paddle::framework::LibraryType::kCUDNN); + auto kernel_key_cudnn = + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_cudnn); + ASSERT_EQ(kernel_key_cudnn.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key_cudnn.layout(), phi::DataLayout::NCHW); + ASSERT_EQ(kernel_key_cudnn.backend(), phi::Backend::GPUDNN); +#endif +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10eefff093b0e867131c91fb0a8132175a28c6be..10ceae62dccbbab9329b73e0f581b51508511194 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1457,7 +1457,7 @@ std::ostream& print_tensor>( std::ostream& operator<<(std::ostream& os, const LoD& lod) { // NOTE(xiongkun): // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution - // if we don't redefine, the operator << of pten / framework LoD is not found. + // if we don't redefine, the operator << of phi / framework LoD is not found. 
paddle::string::operator<<(os, lod); return os; } diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 72f7e5af9a96eea2a6cd09912d2dbcc5f53bd931..f198919b0c87bb4f2ea9991e401a8242676d3f46 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,11 +1,11 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) -cc_library(var_helper SRCS var_helper.cc DEPS tensor pten_api) +cc_library(var_helper SRCS var_helper.cc DEPS tensor phi_api) IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) ENDIF() -cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper pten_api) +cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper phi_api) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) @@ -47,9 +47,9 @@ if(WITH_GLOO) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function pten_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner pten_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 94c6d0a4d569a1ce458ed3590385de446d0ee150..149202468be6c6bec833f100adfd4100c520f8f3 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -70,12 +70,12 @@ OpSupportedInfos(const std::string& place, } } - auto pten_kernels = phi::KernelFactory::Instance().kernels(); - for (auto& kernel_pair : pten_kernels) { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto& kernel_pair : phi_kernels) { auto op_type = phi::TransToFluidOpName(kernel_pair.first); for (auto& info_pair : kernel_pair.second) { framework::OpKernelType kernel_type = - 
framework::TransPtenKernelKeyToOpKernelType(info_pair.first); + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); if (is_target_place[query_place](kernel_type.place_) && kernel_type.data_type_ == dtype && all_ops.count(op_type)) { VLOG(4) << op_type << " " << supported_ops.size(); @@ -273,8 +273,9 @@ static inline std::shared_ptr CastToBF16( template static inline framework::proto::VarType::Type GetPromoteType( - const std::string& op_type, const NameVarMap& ins) { - auto dst_type = framework::proto::VarType::FP16; + const std::string& op_type, const NameVarMap& ins, + const framework::proto::VarType::Type amp_dtype) { + auto dst_type = amp_dtype; for (const auto& pair : ins) { for (const auto& var : pair.second) { if (GetDataType(var) == framework::proto::VarType::FP32) { @@ -337,7 +338,8 @@ NameVarMap AutoCastInputs(const std::string& op_type, } return new_ins; } else { - auto dst_type = GetPromoteType(op_type, ins); + auto dst_type = + GetPromoteType(op_type, ins, framework::proto::VarType::FP16); // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. if (dst_type == framework::proto::VarType::FP16 && @@ -435,7 +437,7 @@ NameVarMap AutoCastBF16Inputs(const std::string& op_type, } } return new_ins; - } else { + } else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { for (auto& pair : new_ins) { VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to float"; @@ -444,6 +446,26 @@ NameVarMap AutoCastBF16Inputs(const std::string& op_type, } } return new_ins; + } else { + auto dst_type = + GetPromoteType(op_type, ins, framework::proto::VarType::BF16); + // NOTE(zhangbo): if the op is registered in unsupported bf16 ops, fall back to fp32. + if (dst_type == framework::proto::VarType::BF16 && + AmpOperators::Instance().GetMutableUnsupportedBf16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } + for (auto& pair : new_ins) { + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to " + << framework::DataTypeToString(dst_type); + for (auto& var : pair.second) { + var = (dst_type == framework::proto::VarType::FP32 + ? CastToFP32(var) + : CastToBF16(var)); + } + } + return new_ins; } return new_ins; } diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 97a188e5c9c2712c2c6d819b7e8f0c5ca0b2a47a..8373c7fe50d0222d6b38a400e82239dc8c3590ad 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -154,7 +154,7 @@ void BasicEngine::CheckBackwardInputs(const OpBase& op) { // Here, we use the type of the corresponding forward datatype.
tensor->mutable_data( - op.place(), framework::TransToPtenDataType(var->ForwardDataType())); + op.place(), framework::TransToPhiDataType(var->ForwardDataType())); VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero with dtype " << framework::DataTypeToString(var->ForwardDataType()); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 3587736a851da57cab6892593a5087dcdd338622..0abc5ad90e2697eb78ff1e21ceb2bc0e97e14a44 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -791,13 +791,13 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var, << var->Var().Get().dims(); tensor->Resize(var->Var().Get().dims()); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } else { auto* tensor = dst_var->MutableVar()->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } } @@ -925,13 +925,13 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, << var->Var().Get().dims(); tensor->Resize(var->Var().Get().dims()); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } else { auto* tensor = dst_var->MutableVar()->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index f1d0c8afdd50e3868423a9906d9955d7aea66983..56ddbf338619890f8a88bdf09a0bb770ec31bb2f 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -314,10 +314,10 @@ static void FillConstantLike(const VariableWrapper &ref_var, // default data_type for now. if (ref_var.ForwardDataType() != -1) { dst_tensor->mutable_data( - place, framework::TransToPtenDataType(ref_var.ForwardDataType())); + place, framework::TransToPhiDataType(ref_var.ForwardDataType())); } else { - dst_tensor->mutable_data( - place, framework::TransToPtenDataType(ref_var.DataType())); + dst_tensor->mutable_data(place, + framework::TransToPhiDataType(ref_var.DataType())); } phi::funcs::set_constant(*dev_ctx, dst_tensor, value); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 6d18b0a86f0911f38e1c51d61467bf9a01a6de21..9dd1dacc02c25474803ef3177d9cd967ee681714 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -121,7 +121,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, kernel_type_(kernel_type), func_(nullptr), dev_ctx_(dev_ctx), - run_pten_kernel_(true), + run_phi_kernel_(true), pt_kernel_signature_(kernel_signature), pt_kernel_(pt_kernel) {} @@ -151,7 +151,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif // NOTE(zhiqiu): for kernels on given device, for example NPU, the order to // choose is: - // pten npu kernel > fluid npu kernel > pten cpu kernel > fluid cpu kernel + // phi npu kernel > fluid npu kernel > phi cpu kernel > fluid cpu kernel // 1. 
get expected kernel key auto dygraph_exe_ctx = DygraphExecutionContext( @@ -168,12 +168,12 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()); #endif - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { - pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { + pt_kernel_signature = op.GetExpectedPhiKernelArgs(dygraph_exe_ctx); VLOG(6) << pt_kernel_signature; pt_kernel_name = pt_kernel_signature.name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key); + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); @@ -195,7 +195,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); } else { - VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } @@ -211,7 +211,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, || is_xpu_unsupport #endif ) { - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); auto pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( @@ -423,12 +423,12 @@ static void PreparedOpRunPtImpl( platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - PreparePtenData(pt_kernel, pt_kernel_signature, ins); + PreparePhiData(pt_kernel, pt_kernel_signature, ins); phi::KernelContext pt_kernel_context; - BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, - outs, attrs, default_attrs, dev_ctx, - &pt_kernel_context); + BuildDygraphPhiKernelContext(pt_kernel_signature, pt_kernel, ins, + outs, attrs, default_attrs, dev_ctx, + &pt_kernel_context); pt_kernel(&pt_kernel_context); } @@ -451,7 +451,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -465,7 +465,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -479,7 +479,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 879b3ec3e68a25141c239d00e25fab92914ef068..8e1e2fbe9a12da672a633075ed4c41d3d62cd7e1 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/data_transform.h" #include 
"paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/layer.h" @@ -201,9 +201,9 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelFunc func_; platform::DeviceContext* dev_ctx_; // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, + // new phi kernel, if there is a better design in the future, // we may polish the implementation here - bool run_pten_kernel_{false}; + bool run_phi_kernel_{false}; bool run_kp_kernel_{false}; framework::KernelSignature pt_kernel_signature_; phi::Kernel pt_kernel_; @@ -225,7 +225,7 @@ const inline framework::Attribute& GetAttr( } template -void BuildDygraphPtenKernelContext( +void BuildDygraphPhiKernelContext( const framework::KernelSignature& pt_kernel_signature, const phi::Kernel& pt_kernel, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, @@ -327,7 +327,7 @@ void BuildDygraphPtenKernelContext( experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, output_defs.at(i)); framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPtenPlace(output_defs.at(i).backend)); + tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -369,7 +369,7 @@ void BuildDygraphPtenKernelContext( auto& ins_vector = ins.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVar(ins_vector[0]->Var()))); + experimental::MakePhiScalarArrayFromVar(ins_vector[0]->Var()))); } else { // ShapeTensorList std::vector variables; variables.reserve(ins_vector.size()); @@ -377,7 +377,7 @@ void BuildDygraphPtenKernelContext( variables.push_back(var_base->MutableVar()); } kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVarList(variables))); + experimental::MakePhiScalarArrayFromVarList(variables))); } } } else if (attr_defs[i].type_index == @@ -409,7 +409,7 @@ void BuildDygraphPtenKernelContext( } else { // scalar is in the input auto& ins_vector = ins.at(attr_names[i]); kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarFromVar(ins_vector[0]->Var()))); + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } } else { @@ -428,7 +428,7 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = framework::TransToPtenDataType( + auto data_type = framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); kernel_ctx->EmplaceBackAttr(data_type); @@ -436,7 +436,7 @@ void BuildDygraphPtenKernelContext( std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Pten_Kernel args. + // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); @@ -456,9 +456,9 @@ void BuildDygraphPtenKernelContext( } template -void PreparePtenData(const phi::Kernel& pt_kernel, - const framework::KernelSignature& pt_kernel_signature, - const NameVarMap& ins) { +void PreparePhiData(const phi::Kernel& pt_kernel, + const framework::KernelSignature& pt_kernel_signature, + const NameVarMap& ins) { auto& input_names = std::get<0>(pt_kernel_signature.args); auto& input_defs = pt_kernel.args_def().input_defs(); @@ -482,12 +482,12 @@ void PreparePtenData(const phi::Kernel& pt_kernel, if (in_def.backend == phi::Backend::ALL_BACKEND) { continue; } - auto expected_place = phi::TransToPtenPlace(in_def.backend); + auto expected_place = phi::TransToPhiPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; } - VLOG(3) << "Pten Transform Variable " << input_names[i] << " from " + VLOG(3) << "Phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; framework::Tensor tmp_tensor; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 8681382394b9eea65ddcd8977c96e8a517516edd..3a6365b2af21ae9012fe37293699caed9bb23855 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -446,7 +446,7 @@ void Reducer::InitializeGroups( InitializeDenseGroups(variable_indices_, &group); auto tensor = group.dense_contents_.GetMutable(); tensor->Resize(phi::make_ddim({group.all_length_})) - .mutable_data(place_, framework::TransToPtenDataType(group.dtype_)); + .mutable_data(place_, framework::TransToPhiDataType(group.dtype_)); } // map variables to this group by VariableLocator @@ -738,7 +738,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, - framework::TransToPtenDataType(group.dtype_)); + framework::TransToPhiDataType(group.dtype_)); } #ifdef PADDLE_WITH_XPU_BKCL diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index a9c81cb87798b6e7b68de169e7f40ba1c3ccd367..e4f1cfdb3baeed9b5945b7843b6593528df48c29 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -15,7 +15,7 @@ else() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function pten_tensor pten_api pten_api_utils) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function phi_tensor phi_api phi_api_utils) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index bca7ecc5d17dc814931e3f81a21d67ec43159355..6c304278d21fde7af093b25cdd8f62a1d4528d31 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ 
b/paddle/fluid/imperative/tests/test_group.cc @@ -96,7 +96,7 @@ void GroupConcatSplit(Place place, size_t size) { { // concat auto* tensor = group.dense_contents_.GetMutable(); tensor->Resize(phi::make_ddim({group.all_length_})) - .mutable_data(place, framework::TransToPtenDataType(group.dtype_)); + .mutable_data(place, framework::TransToPhiDataType(group.dtype_)); group.ConcatTensors(*dev_ctx); group.DivNRanks(*dev_ctx, 1); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 03811ac778779c24beb765de118f2d7d00af515b..85bcbd1458f24a592b646dfcda750f37f113f73f 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -175,7 +175,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 2); + type, platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -205,17 +205,19 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { - VLOG(5) << "Auto mixed precision run operator: " << type; if (amp_dtype_ == phi::DataType::FLOAT16) { + VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastInputs(type, ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { + VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastBF16Inputs(type, ins); } } else if (amp_level_ == AmpLevel::O2) { - VLOG(5) << "Pure fp16 run operator: " << type; if (amp_dtype_ == phi::DataType::FLOAT16) { + VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureFp16Inputs(type, ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { + VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureBf16Inputs(type, ins); } } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 887bd52bae54770c3637e83c86d098a39f9a2e04..26b8b9e8e17e046964d648f564c26293036e4033 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) -get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) +get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) set(utils_modules stringpiece pretty_log string_helper) add_subdirectory(api) @@ -47,11 +47,11 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
if(WIN32 AND WITH_GPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) + cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) elseif(WITH_IPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) + cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) else() - create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) + create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) endif() if(NOT APPLE) @@ -81,7 +81,7 @@ set(SHARED_INFERENCE_SRCS ${PADDLE_CUSTOM_OP_SRCS}) # shared inference library deps -set(SHARED_INFERENCE_DEPS ${fluid_modules} ${pten_modules} analysis_predictor) +set(SHARED_INFERENCE_DEPS ${fluid_modules} ${phi_modules} analysis_predictor) if (WITH_CRYPTO) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6c465e62780593c043f844e2738132c404e280b5..87efe5ec5190372b48f1bd6387e1c92f456865a1 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -56,8 +56,10 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) - inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + if (WITH_GPU) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + endif() elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 8d3e091dbf5abeff5e32571666e76d50bf91941e..e8e9d895b4e8fb982ccb667352fd6c26228782a5 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 0e4fb3335f3d76eecea85417ac83c205d63ac9c4..eeaa128290339ce8c2ac6961c575d64abaa3c1db 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -198,7 +198,7 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite_api::Tensor& src) { dst->mutable_data( inference::lite::utils::GetNativePlace(src.target()), - framework::TransToPtenDataType(GetNativePrecisionType(src.precision()))); + framework::TransToPhiDataType(GetNativePrecisionType(src.precision()))); SetLoD(dst->mutable_lod(), src.lod()); } @@ -269,7 +269,7 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType( holder, - framework::TransToPtenDataType(GetNativePrecisionType(src->precision()))); + framework::TransToPhiDataType(GetNativePrecisionType(src->precision()))); } } // namespace utils diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 9cefb24751e18dfbb3b8283152cbcd58c81adc58..46e6c18bfb8e31ee3b8bd8f225ebe15443eb9efc 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -88,5 +88,5 @@ class SoftMaxOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(softmax); +USE_OP_ITSELF(softmax); REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc index b6fdcddf309d85a68ea67f33c157fbcf5ce5affc..9cd5e81141598dda6ead275457c53feeb84c5fb8 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -45,4 +45,4 @@ TEST(SoftMaxOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(softmax); +USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index ecf06e9bf15139990d5746a11592816ecde9f9f9..324e9c0392c9397837e05392bd7b0f755e6e14bf 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -113,12 +113,12 @@ nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( template __global__ void SpecialSliceKernel(const T* slice_input, const int32_t* cu_seqlens, T* output) { - const int hidden = blockDim.x * gridDim.y; - const int batch = blockIdx.x; - const int local_idx = blockIdx.y * blockDim.y + threadIdx.x; + const int hidden = blockDim.x * gridDim.x; + const int hidden_id = blockIdx.x * blockDim.x + threadIdx.x; + const int batch_id = blockIdx.y; - output[batch * hidden + local_idx] = - slice_input[cu_seqlens[batch] * hidden + local_idx]; + output[batch_id * hidden + hidden_id] = + slice_input[cu_seqlens[batch_id] * hidden + hidden_id]; } int SpecialSlicePluginDynamic::enqueue( @@ -137,15 +137,16 @@ int SpecialSlicePluginDynamic::enqueue( "hidden should be multiple of 128.")); constexpr int num_threads = 128; - const dim3 blocks(out_dims.d[0], 
hidden / num_threads); - const half* slice_input = static_cast(inputs[0]); const int32_t* cu_seqlens = static_cast(inputs[1]); half* output = static_cast(outputs[0]); - SpecialSliceKernel<<>>(slice_input, - cu_seqlens, output); + const int32_t num_blocks_x = hidden / num_threads; + const int32_t num_blocks_y = out_dims.d[0]; // batchs + const dim3 num_blocks(num_blocks_x, num_blocks_y); // blocks + SpecialSliceKernel<<>>( + slice_input, cu_seqlens, output); return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 85fe931cf93f85d3b25334bdb5ec2d0a62e37b30..37214534f3c937bcf62bb34b51da2c934c566ced 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -299,7 +299,9 @@ inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) -inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +if (WITH_GPU) + inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +endif() inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) # Ernie large @@ -551,7 +553,9 @@ endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +if (WITH_GPU) + inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +endif() # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") @@ -741,13 +745,15 @@ set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120) +if (WITH_GPU) + set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +endif() if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120) if(WITH_MKLDNN) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9b2aaa9308e5df7a1527d0fa217ab12ae1ecc156..4d0e485285146e5668793d29fd8effc789fcc339 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -493,7 +493,8 @@ class 
AllocatorFacadePrivate { "support allocating managed memory.\n" "If you don't actually need to use managed memory, please disable " "it with command `export FLAGS_use_cuda_managed_memory=false`.\n" - "Or you must use the gpu device that supports managed memory.")); + "Or you must use the gpu device that supports managed memory.", + p.device)); } return std::make_shared(p); } diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index d86e5e35c08c0ef46ce86c0f372fc90f8df1811b..f5e4941d787097b5e349c0b668d6c95fad137873 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -18,6 +18,7 @@ #include // NOLINT #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/platform/flags.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" PADDLE_DEFINE_EXPORTED_READONLY_bool( free_idle_chunk, false, @@ -47,6 +48,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { + platform::RecordEvent("AutoGrowthBestFitAllocator::Allocate", + platform::TracerEventType::UserDefined, 9 /*level*/); size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -108,6 +111,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( } void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { + platform::RecordEvent("AutoGrowthBestFitAllocator::Free", + platform::TracerEventType::UserDefined, 9 /*level*/); VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index e7b86d6ec19c06d4ee9086590763f1afe23f99a9..8627e3e6f8811e162ce3014c01145f331a03ee4b 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace memory { @@ -117,6 +118,8 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { + platform::RecordEvent("StreamSafeCUDAAllocator::Allocate", + platform::TracerEventType::UserDefined, 9 /*level*/); ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -144,6 +147,8 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { + platform::RecordEvent("StreamSafeCUDAAllocator::Free", + platform::TracerEventType::UserDefined, 9 /*level*/); StreamSafeCUDAAllocation* stream_safe_cuda_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, diff --git a/paddle/fluid/memory/cuda_managed_memory_test.cu b/paddle/fluid/memory/cuda_managed_memory_test.cu index 4243c5fa90f7fad4f7a98a9d87545ef66cbe9875..f8c9ff82f57127d43bba8e7e03770dd3280832a8 100644 --- a/paddle/fluid/memory/cuda_managed_memory_test.cu +++ b/paddle/fluid/memory/cuda_managed_memory_test.cu @@ -128,6 +128,9 @@ TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) { } TEST(ManagedMemoryTest, OOMExceptionTest) { + if (!platform::IsGPUManagedMemorySupported(0)) { + return; + } EXPECT_THROW(Alloc(platform::CUDAPlace(0), size_t(1) << 60), memory::allocation::BadAlloc); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a279c76430f1b046a4c3ca05485824d5e3b62de2..91a0352e1915e95378012aa398ff996cbc10f216 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -100,7 +100,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten pten_api_utils gather_scatter_kernel) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 915b4daeeb525f15e9db0f63d0f2212f31143fea..de4d7818020dd586547ff9eedb53108285048c09 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/addmm_op.h" #include #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -24,6 +27,8 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +constexpr int kMULMKLDNNINT8 = 1; + using framework::OpKernelType; using framework::Tensor; @@ -31,85 +36,6 @@ class AddMMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::NotFound( - "Input(Input) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::NotFound("Input(Y) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of AddMMOp should not be null.")); - - auto input_dims = ctx->GetInputDim("Input"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto ndim_input = input_dims.size(); - auto ndim_x = x_dims.size(); - auto ndim_y = y_dims.size(); - - float alpha = ctx->Attrs().Get("Alpha"); - float beta = ctx->Attrs().Get("Beta"); - - VLOG(3) << "addmm operator input.shape=" << input_dims - << " x.shape=" << x_dims << " y.shape=" << y_dims - << " beta=" << beta << " alpha=" << alpha - << " ndim_input=" << ndim_input << " ndim_x=" << ndim_x - << " ndim_y=" << ndim_y; - - PADDLE_ENFORCE_NE(phi::product(input_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable Input(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("Input").front())); - - PADDLE_ENFORCE_NE(phi::product(x_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable X(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("X").front())); - - PADDLE_ENFORCE_NE(phi::product(y_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable Y(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("Y").front())); - // dim check - PADDLE_ENFORCE_EQ(ndim_input, 2, - platform::errors::InvalidArgument( - "The input tensor input's dimension must be 2. " - "But received input's dimension = [%s].", - ndim_input)); - PADDLE_ENFORCE_EQ(ndim_x, 2, - platform::errors::InvalidArgument( - "The input tensor x's dimension must be 2. " - "But received x's dimension = [%s].", - ndim_x)); - PADDLE_ENFORCE_EQ(ndim_y, 2, - platform::errors::InvalidArgument( - "The input tensor y's dimension must be 2. 
" - "But received y's dimension = [%s].", - ndim_y)); - - std::vector output_dims; - output_dims.push_back(x_dims[0]); - output_dims.push_back(y_dims[1]); - - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); - ctx->ShareLoD("Input", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library = framework::LibraryType::kPlain; @@ -221,17 +147,11 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PT_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, - ops::AddMMOpGradMaker); + ops::AddMMOpGradMaker, + AddmmInferShapeFunctor); REGISTER_OPERATOR(addmm_grad, ops::AddMMGradOp); - -REGISTER_OP_CPU_KERNEL( - addmm, ops::AddMMKernel, - ops::AddMMKernel); - -REGISTER_OP_CPU_KERNEL( - addmm_grad, ops::AddMMGradKernel, - ops::AddMMGradKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h deleted file mode 100644 index 9d225ba99919249982924e382f2661d7481ed0c7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/addmm_op.h +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; - -using Tensor = framework::Tensor; - -constexpr int kMULMKLDNNINT8 = 1; - -template -class AddMMKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* x = context.Input("X"); - const Tensor* y = context.Input("Y"); - - auto input_dims = input->dims(); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - // broadcast mode check - if (x_dims[0] != input_dims[0]) { - PADDLE_ENFORCE_EQ(input_dims[0], 1, - platform::errors::InvalidArgument( - "When x_dims[0] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[0])); - PADDLE_ENFORCE_EQ( - y_dims[1] == input_dims[1] || input_dims[1] == 1, true, - platform::errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, x_dims, y_dims)); - } - // broadcast mode check - if (y_dims[1] != input_dims[1]) { - PADDLE_ENFORCE_EQ(input_dims[1], 1, - platform::errors::InvalidArgument( - "When y_dims[1] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[1])); - PADDLE_ENFORCE_EQ( - x_dims[0] == input_dims[0] || input_dims[0] == 1, true, - platform::errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, x_dims, y_dims)); - } - // broadcast mode check - PADDLE_ENFORCE_EQ( - x_dims[1], y_dims[0], - platform::errors::InvalidArgument( - "The input tensor X's width must be equal with matrix Y' height. 
" - "But received X's shape = [%s], Y's shape = [%s].", - x_dims[1], y_dims[0])); - - auto* out = context.Output("Out"); - out->mutable_data({x_dims[0], y_dims[1]}, context.GetPlace()); - - float alpha = context.template Attr("Alpha"); - float beta = context.template Attr("Beta"); - - auto blas = phi::funcs::GetBlas(context); - - // calc broadcast dim - Array2 bcast_dims; - bcast_dims[0] = x_dims[0] / input_dims[0]; - bcast_dims[1] = y_dims[1] / input_dims[1]; - VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; - // broadcast using eigen - auto eigen_input = EigenTensor::From(*input); - auto eigen_out = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, 2>::Eval( - place, eigen_out, eigen_input, bcast_dims); - - blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, - x->data(), x_dims[1], y->data(), y_dims[1], beta, - out->data(), y_dims[1]); - } -}; - -template -class AddMMGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto in_dims = ctx.Input("Input")->dims(); - auto* dinput = - ctx.Output(framework::GradVarName("Input")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - float alpha = ctx.Attr("Alpha"); - float beta = ctx.Attr("Beta"); - - int total_elems = 0; - - VLOG(3) << "alpha: " << alpha << " beta: " << beta; - - if (dinput != nullptr) { - dinput->set_lod(dout->lod()); - } - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (dinput) { - dinput->mutable_data(ctx.GetPlace()); - total_elems = in_dims[0] * in_dims[1]; - auto& place = - *ctx.template device_context().eigen_device(); - auto eigen_dout = EigenTensor::From(*dout); - auto eigen_dinput = EigenTensor::From(*dinput); - - bool row_compress = in_dims[0] != dout->dims()[0]; - bool col_compress = in_dims[1] != dout->dims()[1]; - auto eigen_dinput_shape = Array2(dinput->dims()[0], dinput->dims()[1]); - - if (row_compress && col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum().eval().reshape(eigen_dinput_shape); - } else if (row_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); - } else if (col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); - } else { - blas.VCOPY(total_elems, dout->data(), dinput->data()); - } - - blas.SCAL(total_elems, beta, dinput->data()); - } - if (dx) { - dx->mutable_data(ctx.GetPlace()); - total_elems = x->dims()[0] * x->dims()[1]; - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(*dout, false, *y, true, dx); - blas.SCAL(total_elems, alpha, dx->data()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - total_elems = x->dims()[1] * y->dims()[1]; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(*x, true, *dout, false, dy); - blas.SCAL(total_elems, alpha, dy->data()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 915ad2f41cde33ee9519b06b38bb8a59fd37793b..4b1593b1f8b40c0c4380007f85f9bb74fea9cd44 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 253a96004bd30a2d6c0da456c578e8dc4b522cca..4774c0a1dbc3b78607d75efb7bc82d590ca4aa2a 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -12,84 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bilinear_tensor_product_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using framework::Tensor; - class BilinearTensorProductOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Weight"), true, - platform::errors::InvalidArgument("Input(Weight) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument("Output(Out) should not be null.")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto weight_dims = ctx->GetInputDim("Weight"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), 2UL, - platform::errors::InvalidArgument("The input(X) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - y_dims.size(), 2UL, - platform::errors::InvalidArgument("The input(Y) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - weight_dims.size(), 3UL, - platform::errors::InvalidArgument("Expected the input(Weight) is a 3D " - "tensor. 
But received %dD tensor.", - weight_dims.size())); - if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) { - PADDLE_ENFORCE_EQ( - x_dims[0], y_dims[0], - platform::errors::InvalidArgument( - "The first dimension(batch_size) of input(X) must be " - "equal to the first dimension of the input(Y).")); - } - PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], - platform::errors::InvalidArgument( - "The second dimension of input(X) must be equal to " - "the second dimension of the input(Weight).")); - PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], - platform::errors::InvalidArgument( - "The second dimension of input(Y) must be equal to " - "the third dimension of the input(Weight).")); - - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(bias_dims.size(), 2UL, - platform::errors::InvalidArgument( - "The Input(Bias) must be a 2-D tensor with " - "the 2nd dimension fixed to 1 (a row vector).")); - PADDLE_ENFORCE_EQ(bias_dims[0], 1UL, - platform::errors::InvalidArgument( - "The Input(Bias) must be a 2-D tensor with " - "the 2nd dimension fixed to 1 (a row vector).")); - PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], - platform::errors::InvalidArgument( - "The second dimension of input(Bias) must be equal " - "to the first dimension of the input(Weight).")); - } - - ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,59 +59,6 @@ Where $W_i$ is the $i$-th slice of Input(Weight); class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Weight"), true, - platform::errors::InvalidArgument("Input(Weight) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto weight_dims = ctx->GetInputDim("Weight"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, - platform::errors::InvalidArgument( - "The input(Out@GRAD) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[0], out_dims[0], - platform::errors::InvalidArgument( - "The first dimension(batch_size) of input(Out@GRAD) must be " - "equal to the first dimension of the Input(X).")); - PADDLE_ENFORCE_EQ( - weight_dims[0], out_dims[1], - platform::errors::InvalidArgument( - "The second dimension of input(Out@GRAD) must be equal to " - "the third dimension of the Input(Weight).")); - - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) { - ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); - } - - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - auto weight_grad_name = framework::GradVarName("Weight"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); - } - if 
(ctx->HasOutput(weight_grad_name)) { - ctx->SetOutputDim(weight_grad_name, weight_dims); - } - } }; template @@ -208,21 +89,20 @@ class BilinearTensorProductGradOpMaker } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, + BilinearTensorProductInferShapeFunctor, + PT_INFER_META(phi::BilinearTensorProductInferMeta)); +DELCARE_INFER_SHAPE_FUNCTOR( + bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, + PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, ops::BilinearTensorProductGradOpMaker, - ops::BilinearTensorProductGradOpMaker); + ops::BilinearTensorProductGradOpMaker, + BilinearTensorProductInferShapeFunctor); REGISTER_OPERATOR(bilinear_tensor_product_grad, - ops::BilinearTensorProductOpGrad); -REGISTER_OP_CPU_KERNEL( - bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_CPU_KERNEL( - bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductOpGrad, + BilinearTensorProductGradInferShapeFunctor); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu deleted file mode 100644 index c2b4f69e6854522b91dfd9fb5f738c0e5ffc77b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/bilinear_tensor_product_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_CUDA_KERNEL( - bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h deleted file mode 100644 index 2dbe3a132d78aed1593041bd83f682250f79596c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -using EigenMatrix = framework::EigenMatrix; - -template -class BilinearTensorProductKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto y_mat = EigenMatrix::From(*y); - auto output_mat = EigenMatrix::From(*out); - - auto batch_size = x->dims()[0]; - auto weight_dims = weight->dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - auto& place = *ctx.template device_context().eigen_device(); - auto& dev_ctx = ctx.template device_context(); - - // Create the intermediate variable to calculate the result of - // Input(X) multiplied by Input(Weight_i), the formula is: - // left_mul = X Weight_i. - Tensor left_mul; - left_mul.mutable_data(phi::make_ddim({batch_size, y_dim}), - ctx.GetPlace()); - auto left_mul_mat = EigenMatrix::From(left_mul); - - for (int i = 0; i < out_dim; ++i) { - auto output_col_vec = output_mat.chip(i, 1); - Tensor weight_mat = - weight->Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); - phi::funcs::GetBlas(dev_ctx).GEMM( - CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); - output_col_vec.device(place) = - (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); - } - if (bias) { - auto bias_vec = EigenMatrix::From(*bias); - Eigen::DSizes bcast(batch_size, 1); - output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; - } - } -}; - -template -class BilinearTensorProductGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor* weight = ctx.Input("Weight"); - Tensor* d_x = ctx.Output(framework::GradVarName("X")); - Tensor* d_y = ctx.Output(framework::GradVarName("Y")); - Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); - Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); - const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); - - auto batch_size = x->dims()[0]; - auto weight_dims = weight->dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - - auto x_mat = EigenMatrix::From(*x); - auto y_mat = EigenMatrix::From(*y); - auto d_out_mat = EigenMatrix::From(*d_out); - auto& place = *ctx.template device_context().eigen_device(); - auto& dev_ctx = ctx.template device_context(); - // Create the intermediate variable to calculate the Output(Y@Grad). - Tensor x_scale; - x_scale.mutable_data(phi::make_ddim({batch_size, x_dim}), - ctx.GetPlace()); - auto x_scale_mat = EigenMatrix::From(x_scale); - - // Create the intermediate variable to calculate the Output(X@Grad). 
- Tensor y_scale; - y_scale.mutable_data(phi::make_ddim({batch_size, y_dim}), - ctx.GetPlace()); - auto y_scale_mat = EigenMatrix::From(y_scale); - - phi::funcs::SetConstant set_zero; - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_x, static_cast(0)); - } - - if (d_y) { - d_y->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_y, static_cast(0)); - } - - if (d_weight) { - d_weight->mutable_data(ctx.GetPlace()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // Caculate the Output(X@Grad) and Output(Y@Grad). - if (d_x || d_y || d_weight) { - Eigen::DSizes bcast_for_x(1, y_dim); - Eigen::DSizes bcast_for_y(1, x_dim); - Eigen::DSizes bcast_for_weight(1, x_dim); - - for (int i = 0; i < out_dim; ++i) { - Tensor weight_i = - weight->Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); - auto output_vec = d_out_mat.chip(i, 1); - - if (d_x) { - y_scale_mat.device(place) = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_x) * - y_mat; - blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, - y_scale.data(), weight_i.data(), 1, d_x->data()); - } - - if (d_y || d_weight) { - auto output_vec_y = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_y); - x_scale_mat.device(place) = output_vec_y * x_mat; - if (d_y) { - blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, - x_scale.data(), weight_i.data(), 1, d_y->data()); - } - if (d_weight) { - Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( - phi::make_ddim({x_dim, y_dim})); - blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, - x_scale.data(), y->data(), 0, d_weight_i.data()); - } - } - } - } - - // calculate the gradient of Input(Bias). - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); - d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 4ca0dded3e7385234e3dc630e6260c08fb45f3a8..bc6cf9d831ff0faf00d3db7fdc6105f301781f8b 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -138,7 +138,7 @@ class CastOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -// cast use pten kernel, so no need to REGISTER_OP_CPU_KERNEL here. +// cast use phi kernel, so no need to REGISTER_OP_CPU_KERNEL here. 
REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, ops::CastOpGradMaker, diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 62d747cb9f4001e4fcee64a49ee8a16a49eb2617..034cb47fab189b3c7a712d4d720887de227d8573 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -63,12 +63,12 @@ class CastOpKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace(), static_cast(out_dtype)); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); // call new kernel phi::CastKernel( - static_cast::TYPE&>(dev_ctx), *in, pt_out_dtype, out); } diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index 25b3a446a0a32e61407d2ffa796c30d9a6625532..64324d9772b47de8dfec256f75f60873ce6aafeb 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -46,11 +46,11 @@ class CastXPUKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace(), static_cast(out_dtype)); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); - // call pten kernel + // call phi kernel phi::CastKernel( - static_cast::TYPE&>(dev_ctx), *in, pt_out_dtype, out); } diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 0902f5b6bc9e80adfb990c0bc6e80d12db408ea9..09e915a6bafd4a8b72f35995b3ebbfeafa00476a 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ using framework::Tensor; class CholeskyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cholesky"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cholesky"); - auto dims = ctx->GetInputDim("X"); - auto rank = dims.size(); - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. But " - "received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - dims[rank - 2], dims[rank - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should be symmetric " - "positive-definite matrices and have the same size. 
But received " - "X's shape[-2] = %d and shape[-1] = %d.", - dims[rank - 2], dims[rank - 1])); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } }; class CholeskyOpMaker : public framework::OpProtoAndCheckerMaker { @@ -107,15 +90,10 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PT_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, - ops::CholeskyGradOpMaker); + ops::CholeskyGradOpMaker, + CholeskyInferShapeFunctor); REGISTER_OPERATOR(cholesky_grad, ops::CholeskyGradOp); - -REGISTER_OP_CPU_KERNEL(cholesky, ops::CholeskyCPUKernel, - ops::CholeskyCPUKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_grad, - ops::CholeskyGradKernel, - ops::CholeskyGradKernel); diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu deleted file mode 100644 index 43c16d607c2dbaefdcb576a07ad607f934b0f08e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_op.cu +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" - -namespace paddle { -namespace operators { - -template -class CholeskyGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - bool upper = context.Attr("upper"); - auto& dims = x->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - int m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - const auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - - // matrices are assumed to be stored in column-major order in cusolver - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; - // portf is inplace, thus copy the triangular part of the input matrices to - // the output and set the other triangular part to 0 firstly - platform::ForRange for_range(dev_ctx, - tensor_size); - if (upper) { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ 0, /* num_upper_diags */ m, x_data, - out_data); - for_range(matrix_band_part_functor); - } else { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, x_data, - out_data); - for_range(matrix_band_part_functor); - } - - auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); - auto* info_ptr = reinterpret_cast(info->ptr()); - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) - if (batch_count > 1) { - std::vector output_ptrs; - for (int i = 0; i < batch_count; i++) { - output_ptrs.emplace_back(out_data + i * m * m); - } - thrust::device_vector dev_output_ptrs(output_ptrs.begin(), - output_ptrs.end()); - PotrfBatched(dev_ctx, uplo, m, - thrust::raw_pointer_cast(dev_output_ptrs.data()), m, - info_ptr, batch_count); - // TODO(guosheng): There seems to a bug in cusolver potrfBatched and need - // to clear the upper triangle of the output. Remove this workaround once - // the bug is fixed. - if (!upper) { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, out_data, - out_data); - for_range(matrix_band_part_functor); - } - } else { -#endif - for (int i = 0; i < batch_count; i++) { - Potrf(dev_ctx, uplo, m, out_data + i * m * m, m, info_ptr + i); - } - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) - } -#endif - // check the info - std::vector error_info; // only for checking positive matrix - error_info.resize(batch_count); - - memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(), - info_ptr, sizeof(int) * batch_count, dev_ctx.stream()); - - for (int i = 0; i < batch_count; ++i) { - PADDLE_ENFORCE_EQ(error_info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U.", i, - error_info[i], error_info[i])); - } - } - - void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, - int n, T* A, int lda, int* info) const; - - void PotrfBatched(const platform::CUDADeviceContext& dev_ctx, - cublasFillMode_t uplo, int n, T* Aarray[], int lda, - int* info_array, int batch_size) const; -}; - -#define FUNC_WITH_TYPES(m) m(float, S) m(double, D) - -#define POTRF_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::Potrf(const platform::CUDADeviceContext& dev_ctx, \ - cublasFillMode_t uplo, int n, T* A, \ - int lda, int* info) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - int workspace_size = 0; \ - PADDLE_ENFORCE_GPU_SUCCESS( \ - platform::dynload::cusolverDn##C##potrf_bufferSize( \ - handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = memory::Alloc(dev_ctx, workspace_size); \ - T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ - handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ - } - -FUNC_WITH_TYPES(POTRF_INSTANCE); - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) -#define POTRF_BATCH_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::PotrfBatched( \ - const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ - int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrfBatched( \ - handle, uplo, n, Aarray, lda, info_array, batch_size)); \ - } - -FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); -#endif - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(cholesky, ops::CholeskyGPUKernel, - ops::CholeskyGPUKernel); -REGISTER_OP_CUDA_KERNEL( - cholesky_grad, - ops::CholeskyGradKernel, - ops::CholeskyGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_op.h b/paddle/fluid/operators/cholesky_op.h deleted file mode 100644 index 9504909073f7911c305ef952bca49b5b0bbca47f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "Eigen/Cholesky" -#include "Eigen/Core" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CholeskyCPUKernel : public framework::OpKernel { - public: - // different with EigenMatrix in framework/eigen.h - using EigenMatrix = - Eigen::Matrix; - using InputMatrixMap = Eigen::Map; - using OutputMatrixMap = Eigen::Map; - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - bool upper = context.Attr("upper"); - auto& dims = x->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - - const auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - // Cholesky decomposition for each matrix, maybe can use multi threads - for (int i = 0; i < batch_count; i++) { - auto input = InputMatrixMap(x_data + i * m * m, m, m); - auto output = OutputMatrixMap(out_data + i * m * m, m, m); - if (upper) { - Eigen::LLT< - Eigen::Matrix, - Eigen::UpLoType::Upper> - llt_decomposition(input); - PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The " - "%d-th input matrice " - "might not be not be positive definite.", - i)); - output = llt_decomposition.matrixU(); - } else { - Eigen::LLT< - Eigen::Matrix, - Eigen::UpLoType::Lower> - llt_decomposition(input); - PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The " - "%d-th input matrice " - "might not be not be positive definite.", - i)); - output = llt_decomposition.matrixL(); - } - } - } -}; - -/*! 
Use these functors to implement tril, triu, diagonal and other operators */ -template -struct EyeFunctor { - EyeFunctor(const int m, const int n, T* output) - : m_(m), n_(n), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int global_row = index / n_; - const int col = index - global_row * n_; - const int batch = global_row / m_; - const int row = global_row - batch * m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_, n_; - T* output_; -}; - -template -struct MatrixBandPartFunctor { - /*! Set output as input value outside a central band and 0 inside that band. - * That is: output[i, j, ..., m, n] = in_band(m, n) * input[i, j, ..., m, n] - * where: in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && (num_upper - * < 0 || (n-m) <= num_upper) - */ - MatrixBandPartFunctor(const int m, const int n, const int num_lower_diags, - const int num_upper_diags, const T* input, T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = static_cast(0); - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T* input_; - T* output_; -}; - -template -struct MatrixSetDiagFunctor { - /*! Overwrite specified diagonals of output by the values in diagonal. - * diagonals can be a central band specified by num_diags and - * upper_diag_index, where upper_diag_index=0 refers to the main diagonal, - * positive value means superdiagonal and negative value means subdiagonal. - * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len] - * and the num_diags diagonals has a up to down layout. Otherwise it has a - * shape [i, j, ..., max_diag_len]. - */ - MatrixSetDiagFunctor(const int m, const int n, const int num_diags, - const int max_diag_len, const int upper_diag_index, - const T* diag, T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - diag_(diag), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_diag_index * max_diag_len_; - const int batch = batch_and_diag_index / num_diags_; - const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - diag_index_in_input; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - - // Upper-bound checks for diagonals shorter than max_diag_len. - // y_index and x_index are nonnegative by construction. 
- if (y_index < m_ && x_index < n_) { - const int out_index = batch * m_ * n_ + y_index * n_ + x_index; - output_[out_index] = diag_[index]; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T* diag_; - T* output_; -}; - -template -struct MatrixDiagPartFunctor { - /*! Similar to MatrixSetDiagFunctor but return the diagonals. diag_index=0 - * refers to the main diagonal, positive value means superdiagonal and - * negative value means subdiagonal */ - MatrixDiagPartFunctor(const int m, const int n, const int num_diags, - const int max_diag_len, const int upper_diag_index, - const T padding, const T* input, T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_mapped_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_mapped_diag_index * max_diag_len_; - const int batch = batch_and_mapped_diag_index / num_diags_; - const int mapped_diag_index = - batch_and_mapped_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - mapped_diag_index; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - if (y_index < m_ && x_index < n_) { - output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index]; - } else { - output_[index] = padding_; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T padding_; - const T* input_; - T* output_; -}; - -template -struct MatrixBandPartScaleEndFunctor { - /*! Compared with MatrixBandPartFunctor, it scale up values at the end of - * band. It can be used to fuse the following operations, which actually - * output triangular with diagonal scaled up: - * 1. dig = matrix_diag_part(middle) - * 2. middle = matrix_set_diag(middle, diag * scalar) - * 3. middle = matrix_band_part(middle, -1, 0) - */ - MatrixBandPartScaleEndFunctor(const int m, const int n, - const int num_lower_diags, - const int num_upper_diags, const T scale, - const T* input, T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - scale_(scale), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? 
n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = 0; - } else if (col == band_end - 1) { - output_[index] = scale_ * input_[index]; - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T scale_; - const T* input_; - T* output_; -}; - -template -struct AddtoScaleFunctor { - AddtoScaleFunctor(const T scale, const T* input, T* output) - : scale_(scale), input_(input), output_(output) {} - HOSTDEVICE void operator()(size_t index) const { - output_[index] += input_[index]; - output_[index] *= scale_; - } - const T scale_; - const T* input_; - T* output_; -}; - -template -class CholeskyGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Input("Out"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* x_grad = context.Output(framework::GradVarName("X")); - auto* x_grad_data = x_grad->mutable_data(context.GetPlace()); - - bool upper = context.Attr("upper"); - auto& dims = out->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - auto& dev_ctx = context.template device_context(); - - std::vector axis(dims.size() - 2); - std::iota(axis.begin(), axis.end(), 0); - axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2}); - Tensor l, l_grad; - if (upper) { - l.mutable_data(dims, context.GetPlace()); - l_grad.mutable_data(dims, context.GetPlace()); - TransCompute(dims.size(), dev_ctx, *out, &l, axis); - TransCompute(dims.size(), dev_ctx, *out_grad, &l_grad, - axis); - } else { - l = *out; - l_grad = *out_grad; - } - auto* l_data = l.data(); - - /*! refer to Iain Murray (2016); arXiv 1602.07527 */ - /*! phi = matmul(L.transpose(-1, -2), grad) */ - Tensor middle; - auto* middle_data = middle.mutable_data(dims, context.GetPlace()); - auto trans_desc = phi::funcs::CreateMatrixDescriptor(dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(dims, 0, false); - auto blas = phi::funcs::GetBlas(context); - blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0)); - - /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */ - platform::ForRange for_range(dev_ctx, tensor_size); - MatrixBandPartScaleEndFunctor matrix_band_part_scale_end_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, - /* scale */ 0.5, middle_data, middle_data); - for_range(matrix_band_part_scale_end_functor); - - // Compute inverse by solving the triangular linear system AX = B, where B - // is the identity matrix. The matrix X would be overwritten on B - Tensor identity; - auto* identity_data = identity.mutable_data(dims, context.GetPlace()); - EyeFunctor eye_functor(m, m, identity_data); - for_range(eye_functor); - // TODO(guosheng): use trsmBatched for GPU - for (int i = 0; i < batch_count; i++) { - blas.TRSM(/*side*/ CblasLeft, /*uplo*/ CblasLower, - /*trans*/ CblasNoTrans, /*diag*/ CblasNonUnit, /*m*/ m, /*n*/ m, - /*alpha*/ T(1), l_data + i * m * m, /*lda*/ m, - identity_data + i * m * m, /*ldb*/ m); - } - Tensor& l_inverse = identity; - - /*! 
x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */ - Tensor middle1; - middle1.mutable_data(dims, context.GetPlace()); - blas.MatMul(l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, - T(0)); - blas.MatMul(middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, - T(0)); - - /*! x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */ - Tensor x_grad_trans; - auto* x_grad_trans_data = - x_grad_trans.mutable_data(dims, context.GetPlace()); - TransCompute(dims.size(), dev_ctx, *x_grad, &x_grad_trans, - axis); - AddtoScaleFunctor addto_scale_functor(0.5, x_grad_trans_data, - x_grad_data); - for_range(addto_scale_functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index 86ed7574654959849beb0c1d547a736ad9e1546c..f25fbbb0c698036951c4b9ae8e9ad2778786a1a2 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -203,7 +203,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { commonterm_conj = helper.Transpose(commonterm_conj); phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), commonterm, commonterm_conj, -1, &commonterm); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index a2fc080faadcf9c24ccc703524cd71da92ce7cdb..f1247ebdf23c8e00cdbfd662a160912a769d7558 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,13 +1,13 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) - cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) + cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0b677f79f7f5d7b9a4a9b2627890e1a42745113a..0a21d937aa1a70120e6112cdb291aa41eb222bb3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -17,22 +17,39 @@ #include #include #include +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/program_desc.h" +#include 
"paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::Scope; +using framework::LoDTensor; +using framework::ParallelExecutor; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; +using framework::paddle2cinn::Name2VarInfoMap; +using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; -CinnLaunchContext::CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope) - : cinn_scope_(cinn_scope) { - // generate all names of the cinn execution arguments +CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj) + : cinn_scope_(compiled_obj.scope) { + // collect all names of the CINN execution arguments auto var_names = cinn_scope_->var_names(); cinn_argument_names_.reserve(var_names.size()); std::transform( @@ -40,7 +57,42 @@ CinnLaunchContext::CinnLaunchContext( std::inserter(cinn_argument_names_, cinn_argument_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); // build name map between the original variables and compiled ones - BuildVarNameMap(paddle2cinn_varmap, cinn_argument_names_); + BuildVarNameMap(compiled_obj.paddle2cinn_varmap, cinn_argument_names_); + + const auto& input_var_names = + graph.Get>(framework::paddle2cinn::kInputVars); + const auto& output_var_names = + graph.Get>(framework::paddle2cinn::kOutputVars); + internal_var_names_ = + ExtractInternalVarNames(input_var_names, output_var_names); + // check completeness of output variables in compiled result + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + + // initialize all execution arguments + InitializeArguments(); + // DEPRECATED(CtfGo): following callback assignment will be deprecated soon + for (auto&& var_name : input_var_names) { + if (IsVariableUsed(var_name)) { + AssignExternalVariable(var_name); + } + } + for (auto&& var_name : output_var_names) { + AssignExternalVariable(var_name); + } + for (auto&& var_name : internal_var_names_) { + AssignInternalVariable(var_name); + } + + // Convert the CINN runtime program to a Paddle graph + runtime_graph_ = std::make_unique( + BuildCompiledProgram(graph, compiled_obj)); + runtime_graph_->SetNotOwned( + kMemOptVarInfoFromMainGraph, + &graph.Get(kMemOptVarInfoFromMainGraph)); } void CinnLaunchContext::BuildVarNameMap( @@ -94,21 +146,15 @@ void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, << std::addressof(place); } -bool CinnLaunchContext::IsArgumentsInitialized() const { - if (hold_buffers_.empty() || name2argument_.empty()) { - return false; - } - return true; -} - bool CinnLaunchContext::IsVariableUsed(const std::string& var_name) const { return paddle2cinn_varmap_.count(var_name) > 0; } -CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& arg_name) { - PADDLE_ENFORCE_GT(cinn_argument_names_.count(arg_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn scope.", arg_name)); +CinnTensor CinnLaunchContext::GetCinnTensorOfVar(const std::string& var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(var_name), true, + 
platform::errors::NotFound("Variable(%s) not applied in CINN", var_name)); + const auto& arg_name = paddle2cinn_varmap_.at(var_name); return cinn_scope_->GetTensor(arg_name); } @@ -132,10 +178,13 @@ std::unordered_set CinnLaunchContext::ExtractInternalVarNames( return remain_var_names; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& var_name, const framework::LoDTensor& paddle_tensor) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Variable(%s) not applied in cinn", var_name)); // check dimension + auto cinn_tensor = GetCinnTensorOfVar(var_name); auto cinn_dims = phi::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( @@ -146,22 +195,28 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, // TODO(CtfGo): check the underlying data type after CINN ready } +void CinnLaunchContext::InitializeArguments() { + for (auto&& arg : cinn_argument_names_) { + auto cinn_buffer = std::make_unique(); + auto cinn_tensor = GetCinnTensorOfVar(cinn2paddle_varmap_.at(arg)); + // assign dimensions with corresponding compiled tensor + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + VLOG(4) << string::Sprintf( + "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg, + framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), + name2argument_.size()); + name2argument_.emplace(arg, cinn_buffer.get()); + hold_buffers_.emplace_back(std::move(cinn_buffer)); + } +} + void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - const auto& paddle_tensor = cached_scope_->GetVar(var_name)->Get(); - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - if (paddle_tensor.IsInitialized()) { - CheckTensorEquivalent(var_name, paddle_tensor, cinn_tensor); - } - - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = cached_scope_->GetVar(var_name)->GetMutable(); @@ -177,22 +232,14 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { // Do nothing return 0; }); - - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); - + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external 
malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = @@ -212,30 +259,106 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { tensor->clear(); return 0; }); - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AppendArgument( - const std::string& arg_name, std::unique_ptr&& buffer) { - name2argument_.emplace(arg_name, buffer.get()); - hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg_name, - framework::DDim(buffer->dims, buffer->dimensions).to_str(), - name2argument_.size()); +framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( + const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) { + CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get(); + // Step 0: Create an empty program_desc, there will be only one block + framework::ProgramDesc program_desc; + auto* block = program_desc.MutableBlock(0); + const std::vector>& instructions = + runtime_program->GetRunInstructions(); + + // build a map that links the name of a Paddle variable to its VarDesc + const std::unordered_set& nodes = graph.Nodes(); + std::unordered_map original_vardescs; + for (auto* node : nodes) { + if (node->IsVar() && node->Var()) { + original_vardescs.emplace(node->Name(), node->Var()); + } + } + + // Step 1: Create a VarDesc for each execution argument: + // (1) For those variables that are input or output variables of the + // original subgraph, there must exist an original VarDesc, so + // we copy some useful info(such as IsParameter,Persistable) + // to the new VarDesc. + // (2) For all variables, the shape, data type of their VarDescs + // are set by values of the corresponding compiled tensors, + // including the in/out variables where the equiality between their tensors + // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
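+  //     Illustrative mapping (names borrowed from the unit test in
+  //     cinn_launch_context_test.cc below, shown here only as an example):
+  //     the CINN argument "cinn_var1" maps back to the Paddle variable
+  //     "var1" through cinn2paddle_varmap_, and the VarDesc created for
+  //     "var1" takes its shape from the compiled tensor registered under
+  //     "cinn_var1" in cinn_scope_.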
+ for (auto&& arg : cinn_argument_names_) { + const std::string& var_name = cinn2paddle_varmap_.at(arg); + framework::VarDesc* var_desc = block->Var(var_name); + var_desc->SetType(framework::proto::VarType::LOD_TENSOR); + + auto res = original_vardescs.find(var_name); + if (res != original_vardescs.end()) { + auto* ori_desc = res->second; + var_desc->SetPersistable(ori_desc->Persistable()); + var_desc->SetIsParameter(ori_desc->IsParameter()); + } + + auto cinn_tensor = GetCinnTensorOfVar(var_name); + // TODO(CtfGo): set the corresponding data type after CINN ready, + // currently set as FP32 in default + var_desc->SetDataType(framework::proto::VarType::FP32); + var_desc->SetShape(std::vector(cinn_tensor->shape().data().begin(), + cinn_tensor->shape().data().end())); + } + + // transform names of the input or output arguments of a CINN instruction + // to the corresponding Paddle variable names, and repack them as one vector + auto trans_and_pack_args_fn = + [this](const std::vector>& cinn_args_array) { + std::vector var_names; + for (auto&& cinn_args : cinn_args_array) { + for (auto&& arg : cinn_args) { + auto res = cinn2paddle_varmap_.find(arg); + PADDLE_ENFORCE_NE( + res, cinn2paddle_varmap_.end(), + platform::errors::NotFound("Argument(%s) not found", arg)); + var_names.emplace_back(res->second); + } + } + return var_names; + }; + + // Step 2: create a VarDesc of cinn_instruction_run op for + // each CINN instruction and append it to the main block + for (auto ins_idx = 0; ins_idx < instructions.size(); ++ins_idx) { + auto* ins = instructions.at(ins_idx).get(); + auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); + auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + + auto* op_desc = block->AppendOp(); + op_desc->SetType("cinn_instruction_run"); + op_desc->SetInput(kX, in_args); + op_desc->SetOutput(kOutputs, out_args); + op_desc->SetAttr(kCachedIndex, + {static_cast(compiled_obj.cached_index)}); + op_desc->SetAttr(kInstructionIndex, {static_cast(ins_idx)}); + } + + return program_desc; } -const std::map& -CinnLaunchContext::FinalizeArguments() const { - // Check all execution parameters are assigned valued. 
- std::for_each(cinn_argument_names_.begin(), cinn_argument_names_.end(), - [this](const auto& arg_name) { - PADDLE_ENFORCE_GT( - name2argument_.count(arg_name), 0, - platform::errors::NotFound( - "Argument(%s) is missed for execution", arg_name)); - }); - return name2argument_; +ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, + framework::Scope* scope) { + if (!parallel_executor_) { + framework::details::ExecutionStrategy exec_strategy; + framework::details::BuildStrategy build_strategy; + parallel_executor_ = std::make_unique( + place, scope, exec_strategy, build_strategy, runtime_graph_.get()); + } + + // update the scope bound to an OpHandle and rebuild temporary variables + std::unordered_map scope_map = { + {parallel_executor_->GetLocalScopes().front(), scope}}; + parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); + parallel_executor_->PrepareVariables(scope); + return parallel_executor_.get(); } cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 502e6a92dc10bba4a39bef0a493f8c5deb7eeb71..a4d613ea618a886d99344a34ad80aa02e88c10e7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -21,7 +21,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" @@ -35,10 +35,25 @@ class Program; } // namespace cinn::hlir::framework namespace paddle { +namespace framework { +class ProgramDesc; +class Scope; +class VarDesc; + +namespace ir { +class Graph; +} // namespace ir + +namespace paddle2cinn { +class CinnCompiledObject; +} // namespace paddle2cinn +} // namespace framework + namespace operators::details { using CinnTensor = ::cinn::hlir::framework::Tensor; using CinnScope = ::cinn::hlir::framework::Scope; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; // This class is used to cache some reusable data among repeated // executions for efficiency and it also provides easy interfaces @@ -49,58 +64,71 @@ using CinnScope = ::cinn::hlir::framework::Scope; // Variable while a CINN variable is called an Argument. class CinnLaunchContext { public: - explicit CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope); + explicit CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); + + // Initialize a ParallelExecutor to execute the runtime graph, + // it will be constructed in the first call, and just update + // the execution scope in the following usage. 
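+  // For example (as exercised in cinn_launch_context_test.cc below):
+  //   auto* pe = launch_context->InitializePE(place, &scope);
+  // The first call builds the ParallelExecutor from runtime_graph_; later
+  // calls only rebind the local scope and re-prepare temporary variables
+  // before the caller runs the executor (e.g. via ParallelExecutor::Run).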
+ framework::ParallelExecutor* InitializePE(const platform::Place& place, + framework::Scope* scope); // explicitly update several environment variables captured // by callback of execution arguments void UpdateCapturedEnv(const framework::Scope& scope, const platform::Place& place); - // Return whether execution arguments has been initialized - bool IsArgumentsInitialized() const; - // Return whether a Paddle variable used in cinn execution bool IsVariableUsed(const std::string& var_name) const; - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name); - - // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name); + // Check the equiality in type and dimension between the tensor + // in Paddle and the compiled tensor returned by CINN of a same variable + void CheckTensorEquivalent(const std::string& var_name, + const framework::LoDTensor& paddle_tensor); - // Extract internal variable names from all applied variables - // in execution by excluding the input and output variables - std::unordered_set ExtractInternalVarNames( - const std::vector& input_var_names, - const std::vector& output_var_names); + // Return internal variable names list + const std::unordered_set& GetInternalVarNames() const { + return internal_var_names_; + } // Finalize all execution arguments and return the name->argument map - const std::map& FinalizeArguments() const; + const std::map& FinalizeArguments() const { + return name2argument_; + } // Return the cinn_buffer_t* of a specific variable cinn_buffer_t* GetCinnBufferOfVar(const std::string& var_name); private: - // Get CinnTensor with CINN argument name - CinnTensor GetCinnTensor(const std::string& arg_name); + // Get corresponding compiled tensor of a Paddle variable name + CinnTensor GetCinnTensorOfVar(const std::string& var_name); + // Build the name maps of paddle->cinn and cinn->paddle // in reverse for all variables used in cinn execution void BuildVarNameMap( const std::unordered_map& compiled_varmap, const std::unordered_set& argument_names); - // Check whether the tensor in Paddle and the compiled - // tensor returned by CINN of a same variable - // are equivalent in type and dimension - void CheckTensorEquivalent(const std::string& var_name, - const framework::LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor); + // Extract internal variable names from all applied variables + // in execution by excluding the input and output variables + std::unordered_set ExtractInternalVarNames( + const std::vector& input_var_names, + const std::vector& output_var_names); + + // Initialize each execution argument with a cinn_buffer_t + void InitializeArguments(); - // Append an argument with (cinn name)->(cinn_buffer_t) pair - void AppendArgument(const std::string& arg_name, - std::unique_ptr&& buffer); + // Assign tensor buffer to input or output variables + void AssignExternalVariable(const std::string& var_name); + + // Assign tensor buffer to internal variables + void AssignInternalVariable(const std::string& var_name); + + // Construct a Paddle ProgramDesc with the CINN runtime + // instructions included in the compiled CINN Program + framework::ProgramDesc BuildCompiledProgram( + const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); private: const framework::Scope* cached_scope_ = nullptr; @@ -111,16 +139,22 @@ class CinnLaunchContext { std::unordered_map paddle2cinn_varmap_; // a name map from cinn execution arguments to 
paddle variables std::unordered_map cinn2paddle_varmap_; + // a list of internal variable names in Paddle + std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; + // the ir::Graph object converted from the program compiled by CINN + std::unique_ptr runtime_graph_; + // a ParallelExecutor to execute the runtime graph + std::unique_ptr parallel_executor_; + // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can // not be released until the runtime program finish execution. std::vector> hold_buffers_; - // this map saves all execution arguments with their cinn names as key, // and it is passed to the Execute interface of a cinn runtime program. std::map name2argument_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 58a9c5db712b9ae90f7a3bb486266b61e386d591..4976a59d1dd3829b637f18b3c815e4d2fc9c7526 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,87 +13,229 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include +#include +#include +#include "cinn/common/target.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/phi/core/ddim.h" +USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::OpDesc; +using framework::ProgramDesc; +using framework::LoDTensor; +using framework::ir::Graph; +using framework::ParallelExecutor; +using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -std::unique_ptr CreateDefaultLaunchContext() { +const Graph& InitDefaultSubgraph() { static std::once_flag initialized; - static std::unordered_map paddle2cinn_varmap; - static std::shared_ptr cinn_scope; - std::call_once(initialized, [&paddle2cinn_varmap, &cinn_scope]() { - auto& scope = cinn_scope; - scope = std::make_shared(); + static std::unique_ptr graph; + std::call_once(initialized, [&]() { + ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* var1 = block->Var("var1"); + var1->SetPersistable(true); + block->Var("var2"); + block->Var("var3"); + block->Var("var4"); + auto* var5 = block->Var("var5"); + var5->SetIsParameter(true); + auto add_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var1"}}, {"Y", {"var2"}}}, + {{"Out", {"var3"}}}, {})); + block->AppendAllocatedOp(std::move(add_op)); + auto mul_op = std::unique_ptr(new 
OpDesc( + "mul", {{"X", {"var1"}}, {"Y", {"var2"}}}, {{"Out", {"var4"}}}, {})); + block->AppendAllocatedOp(std::move(mul_op)); + auto res_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var3"}}, {"Y", {"var4"}}}, + {{"Out", {"var5"}}}, {})); + block->AppendAllocatedOp(std::move(res_op)); + graph = std::make_unique(program); + + graph->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({"var1", "var2"})); + graph->Set>( + framework::paddle2cinn::kInternalVars, + new std::vector({"var3", "var4"})); + graph->Set>( + framework::paddle2cinn::kOutputVars, + new std::vector({"var5"})); + graph->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + }); + return *graph.get(); +} +CinnCompiledObject* InitDefaultCompiledObject() { + static std::once_flag initialized; + static auto compiled_obj = std::make_unique(); + std::call_once(initialized, [result = compiled_obj.get()]() { + auto& scope = result->scope; + scope = std::make_shared(); scope->Var("cinn_var1"); scope->GetTensor("cinn_var1")->Resize(CinnShape({3, 4})); scope->Var("cinn_var2"); scope->GetTensor("cinn_var2")->Resize(CinnShape({6, 7, 8})); scope->Var("cinn_var3"); scope->GetTensor("cinn_var3")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var4"); + scope->GetTensor("cinn_var4")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var5"); + scope->GetTensor("cinn_var5")->Resize(CinnShape({10, 16})); - paddle2cinn_varmap = { - {"var1", "cinn_var1"}, {"var3", "cinn_var3"}, {"var4", "cinn_var4"}}; + // input variables: var1, var2; output: var5 + // internal variables: var3 and var4, here var3 is retained + // in result map, so the name will be used neither cinn_var3 + auto& paddle2cinn_varmap = result->paddle2cinn_varmap; + paddle2cinn_varmap = {{"var1", "cinn_var1"}, + {"var2", "cinn_var2"}, + {"var3", "cinn_var3"}, + {"var5", "cinn_var5"}}; + + auto& runtime_program = result->runtime_program; + std::vector> instructions; + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var3"}, "elementwise_add")); + instructions.emplace_back( + new CinnInstruction(cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var4"}, "mul")); + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var3", "cinn_var4"}, {"cinn_var5"}, "elementwise_add")); + runtime_program = + std::make_unique(scope, std::move(instructions)); + result->cached_index = 110; }); - return std::make_unique(paddle2cinn_varmap, cinn_scope); + return compiled_obj.get(); } -TEST(CinnLaunchContextTest, TestBasic) { - auto launch_context = CreateDefaultLaunchContext(); - // test IsVariableUsed +class CinnLaunchContextTest : public ::testing::Test { + public: + std::unique_ptr launch_context; + CinnCompiledObject* compiled_obj; + + void SetUp() override { + compiled_obj = InitDefaultCompiledObject(); + launch_context = std::make_unique(InitDefaultSubgraph(), + *compiled_obj); + } +}; + +TEST_F(CinnLaunchContextTest, TestConstructResult) { ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var2"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var3"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); - // test UpdateCapturedEnv - platform::CPUPlace place; - framework::Scope scope; - ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); - // test IsArgumentsInitialized - 
ASSERT_FALSE(launch_context->IsArgumentsInitialized()); + ASSERT_EQ(launch_context->IsVariableUsed("var5"), true); + + // check result of ExtractInternalVarNames + ASSERT_EQ(launch_context->GetInternalVarNames(), + std::unordered_set({"var3", "cinn_var4"})); + + // check completeness of arguments list, and also check + // the two name maps of the paddle->cinn and the reverse one + // through the IsVariableUsed interface + auto&& arguments = launch_context->FinalizeArguments(); + ASSERT_EQ(arguments.size(), 5); + auto check_argument_fn = [&arguments, this](const std::string& var_name, + const std::string& arg_name) { + ASSERT_EQ(launch_context->IsVariableUsed(var_name), true); + ASSERT_NO_THROW(launch_context->GetCinnBufferOfVar(var_name)); + ASSERT_GT(arguments.count(arg_name), 0); + EXPECT_EQ(launch_context->GetCinnBufferOfVar(var_name), + static_cast(arguments.at(arg_name))); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + auto&& scope = compiled_obj->scope; + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(scope->GetTensor(arg_name)->shape().data())); + }; + check_argument_fn("var1", "cinn_var1"); + check_argument_fn("var2", "cinn_var2"); + check_argument_fn("var3", "cinn_var3"); + check_argument_fn("cinn_var4", "cinn_var4"); + check_argument_fn("var5", "cinn_var5"); } -TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { +TEST_F(CinnLaunchContextTest, TestCheckTensorEquivalent) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(phi::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1"), + ASSERT_THROW(launch_context->CheckTensorEquivalent("var1", *tensor1), paddle::platform::EnforceNotMet); } -TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { +TEST_F(CinnLaunchContextTest, TestBuildCompiledProgram) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); - launch_context->UpdateCapturedEnv(scope, place); - auto* tensor4 = scope.Var("var4")->GetMutable(); + ParallelExecutor* pe = nullptr; + ASSERT_NO_THROW((pe = launch_context->InitializePE(place, &scope))); - // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4"), - paddle::platform::EnforceNotMet); - // not found - ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), - paddle::platform::EnforceNotMet); + // check details of program build by compiled instructions + const ProgramDesc& program = pe->Graph().OriginProgram(); + ASSERT_EQ(program.Size(), 1); + const auto& block = program.Block(0); + // vars + std::set var_names = block.LocalVarNames(); + ASSERT_EQ(var_names.size(), 5); + for (auto&& var_name : var_names) { + auto* var = block.FindVar(var_name); + ASSERT_NE(var, nullptr); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(var->GetShape())); + } + ASSERT_TRUE(block.FindVar("var1")->Persistable()); + ASSERT_FALSE(block.FindVar("var5")->Persistable()); + ASSERT_TRUE(block.FindVar("var5")->IsParameter()); + ASSERT_FALSE(block.FindVar("var1")->IsParameter()); + // ops + ASSERT_EQ(block.OpSize(), 3); + auto* op1 = block.Op(0); + ASSERT_EQ(op1->Type(), "cinn_instruction_run"); + ASSERT_EQ(op1->Input(kX), std::vector({"var1", "var2"})); + 
ASSERT_EQ(op1->Output(kOutputs), std::vector({"var3"})); + ASSERT_EQ(op1->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op1->GetAttrIfExists(kInstructionIndex), 0); + auto* op3 = block.Op(2); + ASSERT_EQ(op3->Type(), "cinn_instruction_run"); + ASSERT_EQ(op3->Input(kX), std::vector({"var3", "cinn_var4"})); + ASSERT_EQ(op3->Output(kOutputs), std::vector({"var5"})); + ASSERT_EQ(op3->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op3->GetAttrIfExists(kInstructionIndex), 2); } -TEST(CinnLaunchContextTest, TestAppendArgument) { - platform::CPUPlace cpu_place; - platform::Place place(cpu_place); +// DEPRECATED(CtfGo): following test of callback assignment +// will be deprecated after we switch to pe +TEST_F(CinnLaunchContextTest, TestCallbackAssignment) { + platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); // assign external variables @@ -101,33 +243,8 @@ TEST(CinnLaunchContextTest, TestAppendArgument) { float* data1 = tensor1->mutable_data(phi::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - - auto* tensor3 = scope.Var("var3")->GetMutable(); - tensor3->mutable_data(phi::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); - - // FinalizeArguments missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); - // test get internal variables - auto internal_variable_names = - launch_context->ExtractInternalVarNames({"var1"}, {"var3"}); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); - - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(phi::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); - // check argument is set correctly and alloc/free callbacks work well - auto name2argument = launch_context->FinalizeArguments(); - ASSERT_EQ(name2argument.size(), 3); - ASSERT_EQ(name2argument.count("cinn_var1"), 1); - ASSERT_TRUE(launch_context->IsArgumentsInitialized()); - - auto* cinn_buffer = - static_cast(name2argument.at("cinn_var1")); + auto* cinn_buffer = launch_context->GetCinnBufferOfVar("var1"); ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 1db9f2f25e270fa61309f3d2e2522b37c73992f4..cf3b98c6679b80acad8da69c91addadb9f66ce44 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -105,63 +105,29 @@ class CinnLaunchOpKernel : public framework::OpKernel { auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. launch_context->UpdateCapturedEnv(scope, place); - if (!launch_context->IsArgumentsInitialized()) { - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. 
- for (const auto& var_name : input_no_need_buffer_variable_names) { - // the input variable declared as 'no need buffer' can not be used - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), false, - platform::errors::InvalidArgument( - "Input variable(%s) should not be used by cinn in execution", - var_name)); - } - - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - - launch_context->AssignExternalVariable(var_name); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - launch_context->AssignExternalVariable(var_name); - } - - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->ExtractInternalVarNames( - input_x_variable_names, output_variable_names); - for (const auto& var_name : internal_variable_names) { - launch_context->AssignInternalVariable(var_name); + // 3.1 Input variables: tensors of input variables have + // been initialized before graph compiled, just check the + // equiality between tensors of paddle and cinn. + for (const auto& var_name : input_x_variable_names) { + // some input variables don't need for cinn because they are + // eliminated by optimized passes or some cinn operators use + // less variables + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; } + launch_context->CheckTensorEquivalent(var_name, + *inputs_name2tensor.at(var_name)); } + // 3.2 Output variables: the output variables will be initialized + // and allocated buffer in callbacks which are defined in the + // external_malloc/free interface of cinn_buffer_t + // in their corresponding arguments. + // 3.3 Internal variables: A temporary scope is created in + // UpdateCapturedEnv to keep the internal variables and + // they are also initialized through callbacks + // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index eb3d725d554b1c522cc87d031ee82c1700dc06a0..9720a5309fa6e1ce0316f709d347599fa125f507 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/phi/core/ddim.h" @@ -31,6 +32,7 @@ using LoDTensor = framework::LoDTensor; using Variable = framework::Variable; using Graph = framework::ir::Graph; using Node = framework::ir::Node; +using framework::paddle2cinn::Name2VarInfoMap; std::unique_ptr CreateOnlyElementwiseAddGraph( const std::string& x_name, const std::string& y_name, @@ -71,6 +73,16 @@ std::unique_ptr CreateOnlyElementwiseAddGraph( y_node->inputs = {feed_op_node_y}; y_node->outputs = {elementwise_add_node}; out_node->inputs = {elementwise_add_node}; + // set necessary attributes + g->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({x_name, y_name})); + g->Set>(framework::paddle2cinn::kInternalVars, + new std::vector({})); + g->Set>(framework::paddle2cinn::kOutputVars, + new std::vector({out_name})); + g->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); return g; } diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 9f27e2238c9c832e62d6de93798b7fab20592a4c..900fd4d8d292e3c4a8884957dceeaa020ee0003e 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -54,7 +54,7 @@ struct FillConstantVisitor { * = nullptr) const { #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(dev_ctx_.GetPlace())) { - Tensor tensor_tmp(framework::TransToPtenDataType(dtype_)); + Tensor tensor_tmp(framework::TransToPhiDataType(dtype_)); tensor_tmp.mutable_data({1}, context_.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value_)); @@ -194,7 +194,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { void *fused_tensor_ptr = fused_tensor->Resize(phi::make_ddim({static_cast(numel)})) .mutable_data(context.GetPlace(), - framework::TransToPtenDataType(dtype)); + framework::TransToPhiDataType(dtype)); VLOG(10) << "Fused tensor addr " << fused_tensor_ptr; // Init the continuous space diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index a04935d43eb2d8ab36749f0d2a35b09552001e7c..7e5120cd2b392b1eb0698727ccebac485193f6d9 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -23,8 +23,9 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \ + defined(PADDLE_WITH_CNCL) #include "paddle/fluid/platform/collective_helper.h" #endif @@ -45,6 +46,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif + #if defined(PADDLE_WITH_ASCEND_CL) DECLARE_bool(hccl_check_nan); #endif @@ -398,6 +403,65 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { } }; +template +class CAllReduceOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + cnclReduceOp_t cncl_red_type = cnclSum; + switch (red_type) { + case kRedSum: + cncl_red_type = cnclSum; + break; + + case kRedMax: + cncl_red_type = cnclMax; + break; + + case kRedMin: + cncl_red_type = cnclMin; + break; + + case kRedProd: + cncl_red_type = cnclProd; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce( + sendbuff, recvbuff, numel, dtype, cncl_red_type, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4879696b3f47032dd30e35b2ffba05af8fa2f609 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index 123fb2aafb524d89901959e57d378838acfdf0af..d315f211709e4f76c2d5c685721961a91c2102fe 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -30,7 +30,8 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int numel = x->numel(); - cnclDataType_t dtype = platform::ToCNCLDataType(x->type()); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 4f1f1ec6512067fbb1e2e5af2bd3ef7cd5af4f9e..b5beb770909b56aed590020ccaaa71f50b96a75d 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -98,8 +99,8 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { const auto& labels_dims = labels->dims(); const int axis = logits_dims.size() - 1; - const int N = SizeToAxis(axis, logits_dims); - const int D = SizeFromAxis(axis, logits_dims); + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); Tensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); @@ -220,8 +221,8 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; - const int N = SizeToAxis(axis, sofrmax_dims); - const int D = SizeFromAxis(axis, sofrmax_dims); + const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); Tensor logit_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h index c7cfd41fa2556873166701c96616323d2b1e40c3..f5399e3215d5822c05ca709d95af47eeab921104 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 2a815ef01e1f7acbfa7f1a3d6ea6808c9877155e..b2173d1b53104a132e721cd3f72f7c6e7ace4af1 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -37,7 +37,7 @@ class ConjKernel : public framework::OpKernel { // call new kernel phi::ConjKernel( - static_cast::TYPE&>(dev_ctx), *x, out); } diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 30ea323733238cd30e8a4e440e1cab08d90c64f0..0160277dc79af50c555b1257e6ffa216b7b56b62 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/diag_v2_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -23,44 +25,6 @@ namespace operators { class DiagV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "diag_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag_v2"); - - auto x_dims = ctx->GetInputDim("X"); - auto offset = ctx->Attrs().Get("offset"); - - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); - ctx->SetOutputDim("Out", {size_, size_}); - } else if (x_dims.size() == 2UL) { - int64_t size_ = 0; - if (offset >= 0) { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] < x_dims[1] - offset) { - size_ = x_dims[0]; - } else { - size_ = x_dims[1] - offset; - } - } else { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] + offset < x_dims[1]) { - size_ = x_dims[0] + offset; - } else { - size_ = x_dims[1]; - } - } - ctx->SetOutputDim("Out", {size_}); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input tensor X's dimensions of DiagV2Op should be either 1 or " - "2, but received %d.", - x_dims.size())); - } - } }; class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -94,59 +58,15 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -class DiagV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* x_data = X->data(); - auto x_dims = X->dims(); - int offset = context.Attr("offset"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_dims = out->dims(); - - int64_t i; - if (x_dims.size() == 1) { - float padding_value = context.Attr("padding_value"); - phi::funcs::SetConstant set_padding_value; - auto& dev_ctx = context.template device_context(); - set_padding_value(dev_ctx, out, 
static_cast(padding_value)); - - auto x_length = x_dims[0]; - const int& x_stride = ComputeStride(0, x_dims); - - auto out_stride_0 = ComputeStride(0, out_dims); - auto out_stride_1 = ComputeStride(1, out_dims); - out_data += - (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - - for (i = 0; i < x_length; i++) { - out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; - } - } else { - auto out_length = out_dims[0]; - const int& x_stride_0 = ComputeStride(0, x_dims); - const int& x_stride_1 = ComputeStride(1, x_dims); - - auto out_stride_0 = ComputeStride(0, out_dims); - x_data += (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); - for (i = 0; i < out_length; i++) { - out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; - } - } - } -}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PT_INFER_META(phi::DiagInferMeta)); + REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag_v2, ops::DiagV2Kernel, - ops::DiagV2Kernel, - ops::DiagV2Kernel, - ops::DiagV2Kernel); + paddle::framework::EmptyGradOpMaker, + DiagInferShapeFunctor); diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu deleted file mode 100644 index 9b83b68bea159a9688a80e1b71eecaacb917153b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/diag_v2_op.cu +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/diag_v2_op.h" - -namespace paddle { -namespace operators { - -// Extract the diagonal of a matrix 'x' to a vector 'out'. 
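// The diagonal addressing in the kernels below (and in the phi kernels this logic was
// moved to) is plain stride arithmetic; a short worked example with hypothetical values:
// for a row-major rows x cols matrix the strides are (cols, 1), so consecutive diagonal
// elements are (cols + 1) apart, and a non-negative offset shifts the start by offset
// columns while a negative one shifts it by -offset rows. E.g. rows = 3, cols = 4,
// offset = 1:
//   start = offset * 1 = 1;          // element (0, 1)
//   step  = cols + 1   = 5;
//   flat indices visited: 1, 6, 11   // elements (0,1), (1,2), (2,3)
// which is exactly the start / sumStride pair ExtractDiagonalKernel iterates with.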
-template -__global__ void ExtractDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, - std::ptrdiff_t size, - const std::ptrdiff_t sumStride, - const std::ptrdiff_t outStride) { - for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - const std::ptrdiff_t xOffset = start + sumStride * idx; - out[outStride * idx] = x[xOffset]; - } -} - -// Paste a vector 'x' to the diagonal of a matrix 'out' -template -__global__ void PasteDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, - std::ptrdiff_t x_length, - const std::ptrdiff_t sumStride, - const std::ptrdiff_t xStride) { - for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < x_length; idx += gridDim.x * blockDim.x) { - const std::ptrdiff_t outOffset = start + sumStride * idx; - out[outOffset] = x[xStride * idx]; - } -} - -template -class DiagV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* x_data = X->data(); - auto x_dims = X->dims(); - int offset = context.Attr("offset"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_dims = out->dims(); - auto& dev_ctx = context.template device_context(); - - auto GetBlockGridSize = [&dev_ctx](int64_t size) { - const int64_t block_size = - std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1), - static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - return std::tuple{block_size, grid_size}; - }; - - if (x_dims.size() == 1) { - float padding_value = context.Attr("padding_value"); - phi::funcs::SetConstant set_padding_value; - set_padding_value(dev_ctx, out, static_cast(padding_value)); - - auto x_length = x_dims[0]; - auto size = (offset > 0) ? x_length + offset : x_length - offset; - const int& x_stride = ComputeStride(0, x_dims); - if (size > 0) { - const auto& out_stride_0 = ComputeStride(0, out_dims); - const auto& out_stride_1 = ComputeStride(1, out_dims); - auto start = - (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - - std::tuple block_grid_size = GetBlockGridSize(size); - - PasteDiagonalKernel< - T><<(block_grid_size), std::get<0>(block_grid_size), 0, - dev_ctx.stream()>>>(out_data, x_data, start, x_length, - out_stride_0 + out_stride_1, x_stride); - } - } else { - const int& x_stride_0 = ComputeStride(0, x_dims); - const int& x_stride_1 = ComputeStride(1, x_dims); - - int64_t size; - if (offset > 0) { - size = std::min(x_dims[0], x_dims[1] - offset); - } else { - size = std::min(x_dims[0] + offset, x_dims[1]); - } - - if (size > 0) { - auto start = (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); - const auto& out_stride_0 = ComputeStride(0, out_dims); - - std::tuple block_grid_size = GetBlockGridSize(size); - - ExtractDiagonalKernel< - T><<(block_grid_size), std::get<0>(block_grid_size), 0, - dev_ctx.stream()>>>(out_data, x_data, start, size, - x_stride_0 + x_stride_1, out_stride_0); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - diag_v2, ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel); diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index e8c28ebfeb00878c69b0e80aef5aa505630f40e8..7fd0a8eb164752f24f0fed4959b0036e1a400f5e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -41,9 +41,9 @@ class DotKernel : public framework::OpKernel { out->mutable_data(x->place()); // call new kernel - phi::DotKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), *x, *y, out); } @@ -66,7 +66,7 @@ class DotGradKernel : public framework::OpKernel { // call new kernel phi::DotGradKernel( - static_cast::TYPE&>(dev_ctx), *tensor_x, *tensor_y, *tensor_dout, tensor_dx, tensor_dy); } diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 21fdf69ac570ac6972173d77194275d629ce436f..2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, const Tensor& mask, int64_t size, Tensor* grad_x, bool is_test = false) { using MT = typename details::MPTypeTrait::Type; - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(grad_y); - - auto& place = *dev_ctx.eigen_device(); + auto stream = dev_ctx.stream(); + MT factor; if (is_test) { if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; + factor = static_cast(1.0f); } else { - dX.device(place) = dY * static_cast(1.0f - dropout_prob); + factor = static_cast(1.0f - dropout_prob); } + std::vector ins = {&grad_y}; + std::vector outs = {grad_x}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { - auto M = EigenVector::Flatten(mask); + std::vector ins = {&grad_y, &mask}; + std::vector outs = {grad_x}; if (dropout_implementation == "upscale_in_train") { if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; +#ifdef PADDLE_WITH_HIP + hipMemset(grad_x->data(), 0, size * sizeof(T)); +#else + cudaMemset(grad_x->data(), 0, size * sizeof(T)); +#endif } else { - auto factor = static_cast(1.0f / (1.0f - dropout_prob)); - auto stream = dev_ctx.stream(); - std::vector ins = {&grad_y, &mask}; - std::vector outs = {grad_x}; - auto functor = CudaDropoutGradFunctor(factor); + factor = static_cast(1.0f / (1.0f - dropout_prob)); paddle::operators::LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } else { - dX.device(place) = dY * M.cast(); + factor = 
static_cast(1.0f); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu deleted file mode 100644 index 2b55d9fbaf6cba83f722e29f6d5359a1a8884c84..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - grad_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 1a256f75675784549958c8dbc41684a9746818df..a995877778e4770ea8ae64c051a71b31c1fb1e29 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -13,7 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#ifdef __xpu__ +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#else #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -21,6 +28,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" #include "paddle/phi/kernels/math_kernel.h" +#endif namespace paddle { namespace operators { @@ -28,7 +36,17 @@ namespace operators { template class ElementwiseAddKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef __xpu__ + std::vector ins; + std::vector outs; + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + const auto& xpu_ctx = + ctx.template device_context(); + paddle::operators::LaunchElementwiseCudaKernel, 1>( + xpu_ctx, ins, &outs, axis, kps::AddFunctor()); +#else auto *x = ctx.Input("X"); auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); @@ -37,9 +55,10 @@ class ElementwiseAddKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), *x, *y, axis, z); +#endif } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index a3fea0d7b3dbf91cbe19c299edea3ffee77d3cbe..d6e0749318e901947b46b4b1d6ff8bbdb16bef36 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -1,14 +1,19 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PADDLE_WITH_XPU_KP + // Please do not modify the following code #if defined(__CUDA_ARCH__) #undef __CUDA_ARCH__ @@ -26,163 +31,31 @@ limitations under the License. 
*/ #undef __NVCC__ #endif -#ifdef PADDLE_WITH_XPU_KP #include // NOLINT #include "xpu/kernel/cluster_header.h" // NOLINT #include "xpu/kernel/debug.h" // NOLINT #include "xpu/kernel/math.h" // NOLINT -#include -#include #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" -#include "paddle/fluid/platform/device/device_wrapper.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseAddXPUKPKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - const auto& xpu_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel, 1>( - xpu_ctx, ins, &outs, axis, kps::AddFunctor()); - } -}; - -static std::vector get_rdims(const std::vector& xdims, - const std::vector& ydims) { - std::vector rdims; - for (size_t i = 0; i < xdims.size(); i++) { - if (xdims[i] != ydims[i]) { - rdims.push_back(i); - } - } - return rdims; -} - -template -class ElementwiseAddGradXPUKPKernel : public ElemwiseGradKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dz = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - const framework::DDim& x_dims = x->dims(); - const framework::DDim& y_dims = y->dims(); - const framework::DDim& dz_dims = dz->dims(); - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - int max_dim = std::max(x_dims.size(), y_dims.size()); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT( - axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", max_dim, - axis)); - - std::vector x_dims_vec(max_dim, 1); - std::vector y_dims_vec(max_dim, 1); - std::vector z_dims_vec(max_dim, 1); - if (x_dims.size() == max_dim) { - for (int i = 0; i < max_dim; i++) { - x_dims_vec[i] = x_dims[i]; - } - } else { - for (int i = 0; i < x_dims.size(); i++) { - x_dims_vec[i + axis] = x_dims[i]; - } - } - - if (y_dims.size() == max_dim) { - for (int i = 0; i < max_dim; i++) { - y_dims_vec[i] = y_dims[i]; - } - } else { - for (int i = 0; i < y_dims.size(); i++) { - y_dims_vec[i + axis] = y_dims[i]; - } - } - - for (int i = 0; i < max_dim; i++) { - z_dims_vec[i] = dz_dims[i]; - } - std::vector rdims_for_x; - std::vector rdims_for_y; - rdims_for_x = get_rdims(x_dims_vec, z_dims_vec); - rdims_for_y = get_rdims(y_dims_vec, z_dims_vec); - const T* dz_data = dz->data(); - auto& dev_ctx = - ctx.template device_context(); - - if (dx != nullptr) { - T* dx_data = dx->mutable_data(ctx.GetPlace()); - if (rdims_for_x.size() == 0) { - if (dx_data != dz_data) { - framework::TensorCopy( - *dz, ctx.GetPlace(), - ctx.template device_context(), dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dz, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(*dz)) { - dx->clear(); - dx->mutable_data(x->dims(), ctx.GetPlace()); - } - - int ret = xpu::reduce_sum( - dev_ctx.x_context(), reinterpret_cast(dz_data), - reinterpret_cast(dx_data), z_dims_vec, rdims_for_x); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); - } - } - - if (dy != nullptr) { - T* dy_data = dy->mutable_data(ctx.GetPlace()); - if (rdims_for_y.size() == 0) { - if (dy_data != dz_data) { - framework::TensorCopy( - *dz, ctx.GetPlace(), - ctx.template device_context(), dy); - } - } else { - int ret = xpu::reduce_sum( - dev_ctx.x_context(), reinterpret_cast(dz_data), - reinterpret_cast(dy_data), z_dims_vec, rdims_for_y); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); - } - } - } -}; - -} // namespace operators -} // namespace paddle +#else +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/phi/kernels/gpu/elementwise.h" +#endif namespace ops = paddle::operators; namespace plat = paddle::platform; +#ifdef PADDLE_WITH_XPU_KP REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, - ops::ElementwiseAddXPUKPKernel); - -REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace, - ops::ElementwiseAddGradXPUKPKernel); - -#endif // PADDLE_WITH_XPU_KP + ops::ElementwiseAddKernel); +#else +REGISTER_OP_CUDA_KERNEL( + grad_add, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); +#endif \ No newline at end of file diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 1df43936920a9b7164c72d21619293301446aff6..c58a7f36548a57a1c8e7770fa282470fba4cc140 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -63,11 +63,11 @@ class ElementwiseDivKernel : public framework::OpKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 5ff0f29ab43a059fefa165dae5c6388231cc8182..e172279145e28c0731ed0d8d91769d0b293662fe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -167,6 +167,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, 
ops::ElementwiseMulTripleGradKernel z_lod->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x_lod); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y_lod); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z_lod); phi::MultiplyRawKernel(static_cast(cuda_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { @@ -100,6 +100,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( @@ -110,6 +111,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x_lod); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z_lod); phi::MultiplyRawKernel( - static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7505890f41d441cbbf958cda3e86b36343e1b2c --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using MLUDeviceContext = platform::MLUDeviceContext; + +static void GetReduceAxes(const int axis, const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* axes) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + for (int64_t i = 0; i < src_dim_size; ++i) { + if (i < axis || i >= target_dim_size + axis) { + axes->push_back(i); + continue; + } + if (src_ddims[i] > target_ddims[i - axis]) { + axes->push_back(i); + } + } +} + +template +class ElementwiseMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out), ToCnnlDataType()); + } +}; + +template +class ElementwiseMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = (axis < 0 ? 
(std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout->dims()) { + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), y_desc.get(), GetBasePtr(y), + x_desc.get(), GetBasePtr(dx), ToCnnlDataType()); + } else { + Tensor dx_temp(x->dtype()); + dx_temp.Resize(dout->dims()); + dx_temp.mutable_data(ctx.GetPlace()); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), y_desc.get(), GetBasePtr(y), + dout_desc.get(), GetBasePtr(&dx_temp), + ToCnnlDataType()); + + std::vector reduce_axes; + GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes); + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(&dx_temp), 0, + nullptr, nullptr, dx_desc.get(), GetBasePtr(dx)); + } + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout->dims()) { + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(dy), ToCnnlDataType()); + } else { + Tensor dy_temp(y->dtype()); + dy_temp.Resize(dout->dims()); + dy_temp.mutable_data(ctx.GetPlace()); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), x_desc.get(), GetBasePtr(x), + dout_desc.get(), GetBasePtr(&dy_temp), + ToCnnlDataType()); + + std::vector reduce_axes; + GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes); + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(&dy_temp), 0, + nullptr, nullptr, dy_desc.get(), GetBasePtr(dy)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(elementwise_mul, ops::ElementwiseMulMLUKernel, + ops::ElementwiseMulMLUKernel, + ops::ElementwiseMulMLUKernel); + +REGISTER_OP_MLU_KERNEL( + elementwise_mul_grad, ops::ElementwiseMulGradMLUKernel, + ops::ElementwiseMulGradMLUKernel, + ops::ElementwiseMulGradMLUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 52de5f77ed325321513d58530ec37ec0e4a23adc..418779c32e8bc216be1532bf714bc21d91c452aa 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -31,18 +31,18 @@ void LaunchElementwiseCudaKernel( std::vector pt_outputs; // TODO(YuanRisheng) *_tmp for cache DenseTensor, 
because the temporary // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // generated by MakePhiDenseTensor can be destroyed when exits loop. *_tmp // can be deleted // when DenseTensor support copy constructor. std::vector> pt_inputs_tmp; std::vector> pt_outputs_tmp; for (auto in : ins) { pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); + std::move(paddle::experimental::MakePhiDenseTensor(*in))); } for (auto out : *outs) { pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); + std::move(paddle::experimental::MakePhiDenseTensor(*out))); } for (int i = 0; i < pt_inputs_tmp.size(); i++) { pt_inputs.push_back(pt_inputs_tmp[i].get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 4a2d92a8c441a9e180c056a19a417be1497c8bae..7d7bb4f26fcf42ec63cd1fab7ec2667a03c8ba4c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" // only can include the headers in paddle/top/api dirs @@ -34,18 +34,18 @@ void LaunchSameDimsElementwiseCudaKernel( std::vector pt_outputs; // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // generated by MakePhiDenseTensor can be destroyed when exits loop. *_tmp // can be deleted // when DenseTensor support copy constructor. 
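// The *_tmp vectors mentioned in the comment above exist purely for lifetime management:
// the factory returns owning wrappers while the launch path consumes raw pointers. A
// minimal sketch of the same keep-alive pattern (a hypothetical Wrap() factory stands in
// for MakePhiDenseTensor here):
//   std::vector<std::unique_ptr<phi::DenseTensor>> owners;  // keeps wrappers alive
//   std::vector<const phi::DenseTensor*> views;             // non-owning, handed to the kernel
//   for (const auto* in : ins) {
//     owners.emplace_back(Wrap(*in));
//     views.push_back(owners.back().get());
//   }
// Destroying `owners` before the kernel runs would leave `views` dangling, which is why
// the temporaries cannot be dropped until DenseTensor supports a copy constructor.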
std::vector> pt_inputs_tmp; std::vector> pt_outputs_tmp; for (auto in : ins) { pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); + std::move(paddle::experimental::MakePhiDenseTensor(*in))); } for (auto out : *outs) { pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); + std::move(paddle::experimental::MakePhiDenseTensor(*out))); } for (int i = 0; i < pt_inputs_tmp.size(); i++) { pt_inputs.push_back(pt_inputs_tmp[i].get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 8d9a2159069423cb3b51517016570057232d2c90..b2cef95d1a349d66161db1c3edf7c14bc8a6d058 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -99,6 +99,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel>, ops::ElementwiseSubKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel>, ops::ElementwiseSubGradKernel, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, @@ -34,6 +36,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, @@ -51,6 +55,8 @@ REGISTER_OP_CUDA_KERNEL( int>, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), *x, *y, axis, z); } @@ -56,7 +56,7 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { auto& dev_ctx = ctx.device_context(); phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), *x, *y, *dout, axis, dx, dy); } @@ -86,7 +86,7 @@ class ElementwiseSubDoubleGradKernel : public framework::OpKernel { ddy_optional = *ddy; } phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), *y, ddx_optional, ddy_optional, *dout, axis, ddout); } diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h index 42c951385a438709569be58507a39230ad77a22d..cb466fffcd7c7358b6e84c18b7895a17b2eaa907 100644 --- a/paddle/fluid/operators/empty_op.h +++ b/paddle/fluid/operators/empty_op.h @@ -39,7 +39,7 @@ class EmptyKernel : public framework::OpKernel { out_tensor->Resize(shape); out_tensor->mutable_data(context.GetPlace(), - framework::TransToPtenDataType(dtype)); + framework::TransToPhiDataType(dtype)); } }; diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 56a6a80b45dff81dcd63efb2db15f2c0c70ab5ee..3d409b4c4f6772bc7b234208e78c5088eeb2fc00 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/erfinv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class ErfinvOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "erfinv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "erfinv"); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class ErfinvOpMaker : public framework::OpProtoAndCheckerMaker { @@ -78,23 +73,13 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, paddle::operators::ErfinvGradMaker, paddle::operators::ErfinvGradMaker, - paddle::operators::ErfinvInplaceInferer); + paddle::operators::ErfinvInplaceInferer, ErfinvInferShapeFunctor); REGISTER_OPERATOR(erfinv_grad, paddle::operators::ErfinvGradOp); - -REGISTER_OP_CPU_KERNEL( - erfinv, - paddle::operators::ErfinvKernel, - paddle::operators::ErfinvKernel); - -REGISTER_OP_CPU_KERNEL( - erfinv_grad, - paddle::operators::ErfinvGradKernel, - paddle::operators::ErfinvGradKernel); diff --git a/paddle/fluid/operators/erfinv_op.h b/paddle/fluid/operators/erfinv_op.h deleted file mode 100644 index 934d0f4a5a7152fbd909082213f2ee7afa22d47f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erfinv_op.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -// ndtri(x * 0.5 + 0.5) / sqrt(2) -template -class ErfinvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto eigen_in = framework::EigenVector::Flatten(*in); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto& place = *ctx.template device_context().eigen_device(); - constexpr T half = static_cast(0.5); - constexpr T half_sqrt = static_cast(M_SQRT1_2); - eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; - } -}; - -// sqrt(pi) / 2 * exp(square(out)) * grad -template -class ErfinvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out = ctx.Input("Out"); - auto dout = ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = *ctx.template device_context().eigen_device(); - - constexpr T half_sqrt_pi = static_cast(1 / M_2_SQRTPI); - eigen_dx.device(place) = - half_sqrt_pi * eigen_dout * eigen_out.square().exp(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 793519b40182114c13e63dd32caaa382d55fa52d..8f8a0f174a79f13f0bee7aa7b425f8c645e15687 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -82,14 +82,8 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, ops::EyeKernel, - ops::EyeKernel); diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h deleted file mode 100644 index 4cec5387e82aa1bbd4bdeb8fbc9681b468e1a0f3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eye_op.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
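The deleted erfinv_op.h above encodes the identities erfinv(x) = ndtri((x + 1) / 2) / sqrt(2) for the forward pass and d/dx erfinv(x) = (sqrt(pi) / 2) * exp(erfinv(x)^2) for the gradient; the CPU registrations are dropped here, presumably in favour of phi erfinv kernels registered elsewhere. A small self-contained check of those identities (plain C++; the Newton inversion of std::erf is purely for illustration):

    #include <cmath>
    #include <cstdio>

    const double kPi = 3.14159265358979323846;

    // Invert erf with Newton's method: f(x) = erf(x) - y, f'(x) = 2/sqrt(pi) * exp(-x^2).
    double ErfinvRef(double y) {
      double x = 0.0;
      for (int i = 0; i < 60; ++i) {
        double fprime = 2.0 / std::sqrt(kPi) * std::exp(-x * x);
        x -= (std::erf(x) - y) / fprime;
      }
      return x;
    }

    int main() {
      double y = 0.3;
      double x = ErfinvRef(y);
      // Gradient formula used by the removed ErfinvGradKernel: sqrt(pi)/2 * exp(x^2).
      double grad = std::sqrt(kPi) / 2.0 * std::exp(x * x);
      std::printf("erfinv(%.1f) = %.6f, d erfinv / dy = %.6f\n", y, x, grad);
      return 0;
    }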
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct EyeFunctor { - EyeFunctor(int64_t num_columns, T* output) - : num_columns_(num_columns), output_(output) {} - - HOSTDEVICE void operator()(size_t idx) const { - output_[idx * num_columns_ + idx] = static_cast(1); - } - - int64_t num_columns_; - T* output_; -}; - -template -class EyeKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto num_rows = ctx.Attr("num_rows"); - auto num_columns = ctx.Attr("num_columns"); - if (num_columns == -1) num_columns = num_rows; - - auto* out_tensor = ctx.Output("Out"); - T* out_data = out_tensor->mutable_data(ctx.GetPlace()); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, out_tensor, static_cast(0)); - - int64_t num_eyes = (std::min)(num_rows, num_columns); - platform::ForRange for_range(dev_ctx, num_eyes); - EyeFunctor functor(num_columns, out_data); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc index e109e9d02a03af6bc6c5e440745d3cac2349492f..5ee3202af135bf1941639a0fcb9d9c69d0d13f45 100644 --- a/paddle/fluid/operators/eye_op_npu.cc +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
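The deleted EyeKernel above zero-fills the output and then writes ones on the diagonal through the flat index i * num_columns + i; the replacement phi kernel presumably follows the same row-major layout. A tiny standalone illustration of that indexing:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      int64_t num_rows = 3, num_columns = 4;
      std::vector<float> out(num_rows * num_columns, 0.f);  // the SetConstant(..., 0) step
      int64_t num_eyes = std::min(num_rows, num_columns);
      for (int64_t i = 0; i < num_eyes; ++i) {
        out[i * num_columns + i] = 1.f;                      // EyeFunctor's diagonal write
      }
      for (int64_t r = 0; r < num_rows; ++r) {
        for (int64_t c = 0; c < num_columns; ++c) {
          std::printf("%g ", out[r * num_columns + c]);
        }
        std::printf("\n");
      }
      return 0;
    }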
*/ -#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index 2a914ff2ebd33024d80f8d88fde97f70a2f203a7..b02e60210c085bfcedb22fe915de6700575b0a4c 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -54,7 +54,7 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { std::isnan(value), false, platform::errors::InvalidArgument("The filled value is NaN.")); - Tensor tensor_tmp(framework::TransToPtenDataType(data_type)); + Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index 896310cd0918b118db003d784daca87c49c5ab32..ec4ba6e926c41bab8d7ceda20486db39f2d4dabe 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -60,9 +60,9 @@ class FillAnyLikeXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - // call pten kernel + // call phi kernel phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), *x, value, phi::DataType::UNDEFINED, out); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc index 353f73cdd6d05374fe5c9b96dbde7b35ee675c1b..de06aeb01e4dda4e8ca4b4e70ca2c3ad6aa4b5dc 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, ops::FillConstantBatchSizeLikeOpKernel, ops::FillConstantBatchSizeLikeOpKernel { auto &dev_ctx = *pool.Get(platform::CPUPlace()); phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } @@ -72,7 +72,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(ctx.GetPlace()); phi::funcs::SetConstant functor; out->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 9ce433a214dd5becfdd979e635eb83e75216bbaf..5bba4da14aba8bf2a6172b7e212dfca642f527fc 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -72,13 +72,13 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(platform::CPUPlace()); phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } else { out->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); - Tensor tensor_tmp(framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); + Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, value); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index eccc53d8766e25b6f4445699e09f80581a28cf3e..d401b5b82f2b0defd3f2b17ed199d0bd01510859 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -122,7 +122,7 @@ class FillConstantKernel : public framework::OpKernel { << ((data_type == framework::proto::VarType::BF16) ? 
"" : ""); tensor->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CPUPlace()); functor(reinterpret_cast(dev_ctx), @@ -130,7 +130,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 1) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), @@ -142,7 +142,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 2) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(platform::CUDAPinnedPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CUDAPinnedPlace()); functor( @@ -155,7 +155,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 3) { #ifdef PADDLE_WITH_XPU tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index eb684f818fb08b7c27dbf137c6dd189168382064..79018f2a97448a8c6265a969dad37bce77d1b7ee 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -61,7 +61,7 @@ class FillConstantNPUKernel : public framework::OpKernel { out_var->mutable_data(shape, ctx.GetPlace()); if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPtenDataType(data_type)); + Tensor tensor_value(framework::TransToPhiDataType(data_type)); tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h index c202fa23ca891d459d658cd3eb1b080593c7801d..c5cbffbf5c695ffe9d16a530b4c84db094a72df2 100644 --- a/paddle/fluid/operators/fill_op.h +++ b/paddle/fluid/operators/fill_op.h @@ -49,10 +49,10 @@ class FillKernel : public framework::OpKernel { out.Resize(phi::make_ddim(ctx.Attr>("shape"))); auto dtype = static_cast(ctx.Attr("dtype")); - auto pten_dtype = framework::TransToPtenDataType(dtype); + auto phi_dtype = framework::TransToPhiDataType(dtype); platform::CPUPlace cpu; auto force_cpu = ctx.Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), pten_dtype); + out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), phi_dtype); framework::LoDTensor tensor; @@ -61,7 +61,7 @@ class FillKernel : public framework::OpKernel { } else { // Always make tensor in CPU memory. tensor.Resize(out.dims()); - tensor.mutable_data(cpu, pten_dtype); + tensor.mutable_data(cpu, phi_dtype); } framework::VisitDataType( diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 3605eabfc1d9bb236b14187c611eed0d149f0acc..5ef13b38c8a86e16cefdc97be6934b313fdb7bc4 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -132,9 +132,9 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { auto &dev_ctx = context.device_context(); // call new kernel - phi::FlattenKernel::TYPE>( - static_cast::TYPE &>(dev_ctx), *in, start_axis, stop_axis, out); } @@ -153,9 +153,9 @@ class FlattenContiguousRangeGradKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); // call new kernel - phi::FlattenGradKernel::TYPE>( - static_cast::TYPE &>(dev_ctx), *d_out, *xshape, d_x); } diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 31fff4b668d543bfa080a4adce2ca9b6f564012a..020277675797358bf87a58ac108e6eaaddb26ccc 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { @@ -123,11 +123,11 @@ class FMHARef { T, T>( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); - SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, - softmax_axis, softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, + softmax_axis, softmax_out_tensor); } else { - SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, softmax_axis, - softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, + softmax_axis, softmax_out_tensor); } transB = CblasNoTrans; @@ -251,9 +251,9 @@ class FMHARef { } if (src_mask_tensor != nullptr) { - SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, - *softmax_out_grad_tensor, softmax_axis, - src_mask_out_grad_tensor); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx_, softmax_out_tensor, *softmax_out_grad_tensor, softmax_axis, + src_mask_out_grad_tensor); // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + // src_mask @@ -272,9 +272,9 @@ class FMHARef { } } else { - SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, - *softmax_out_grad_tensor, softmax_axis, - qk_out_grad_tensor); + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, + *softmax_out_grad_tensor, + softmax_axis, qk_out_grad_tensor); } T* qk_out_grad_data = qk_out_grad_tensor->data(); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 962af435b2312cf876c27e005d19f366d965b1fc..13f1c6808aef2e0873c5ce6493514c47710dcf16 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -34,9 +34,9 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { int input_num = static_cast(ids.size()); framework::Tensor in_ids_( - framework::TransToPtenDataType(framework::proto::VarType::INT64)), + framework::TransToPhiDataType(framework::proto::VarType::INT64)), 
in_embs_( - framework::TransToPtenDataType(framework::proto::VarType::INT64)); + framework::TransToPhiDataType(framework::proto::VarType::INT64)); framework::DDim in_dim{input_num}; int device_id; #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 95bf96073bdd21ddcadd5e447ba38ecb8dd21b83..a227a8e312765b4311314ea884f2c32443924fbc 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/grid_sampler_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { @@ -292,15 +293,12 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { auto* output_data = output->mutable_data(ctx.GetPlace()); VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] << "; " << output->dims()[2] << "; " << output->dims()[3]; - phi::funcs::SetConstant()( - dev_ctx, output, static_cast(0)); int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); - int block_size = 512; - int grid_size = (count + block_size - 1) / block_size; - VLOG(3) << "cuda launch - grid dims: " << grid_size << "; block dims" - << block_size; - grid_sample_cuda_kernel<<>>( + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, count); + grid_sample_cuda_kernel< + T><<>>( count, n, c, out_h, out_w, in_h, in_w, input->data(), grid->data(), output_data, mode, padding_mode, align_corners); } @@ -467,19 +465,14 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { if (ctx.HasOutput(framework::GradVarName("Grid"))) { auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - grid_grad, static_cast(0)); } int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); - int block_size = 512; - int grid_size = (count + block_size - 1) / block_size; - VLOG(3) << "cuda launch grad kernel - grid dims: " << grid_size - << "; block dims" << block_size << "; count: " << count; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, count); grid_sampler_cuda_backward_kernel< - T><<>>( + T><<>>( count, output_grad->data(), input->data(), grid->data(), n, c, out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, padding_mode, align_corners); diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index 95c6ed6690541e73ed76cf51b78cd0f94c115035..f8f8f3fd789ad61a99bcc17bc073b6cfd099f639 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
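In the grid_sampler_op.cu hunk above, the hand-rolled 512-thread launch math is replaced by platform::GetGpuLaunchConfig1D, presumably so block and grid sizes come from the device properties rather than a hard-coded constant. The arithmetic being replaced is the usual one-thread-per-element ceiling division; in isolation (the 512 default below mirrors the removed code, not the helper's implementation):

    #include <cstdint>
    #include <cstdio>

    struct LaunchConfig1D {
      int threads_per_block;
      int blocks_per_grid;
    };

    LaunchConfig1D MakeConfig1D(int64_t count, int threads_per_block = 512) {
      LaunchConfig1D cfg;
      cfg.threads_per_block = threads_per_block;
      cfg.blocks_per_grid =
          static_cast<int>((count + threads_per_block - 1) / threads_per_block);
      return cfg;
    }

    int main() {
      // e.g. count = n * out_h * out_w = 2 * 32 * 32 for grid_sample
      LaunchConfig1D cfg = MakeConfig1D(2 * 32 * 32);
      std::printf("grid=%d block=%d\n", cfg.blocks_per_grid, cfg.threads_per_block);  // grid=4 block=512
      return 0;
    }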
*/ -#include "paddle/fluid/operators/gumbel_softmax_op.h" -#include -#include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,10 +24,6 @@ class GumbelSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -71,20 +68,6 @@ Samples from the Gumbel-Softmax distribution and optionally discretizes. class GumbelSoftmaxGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "gumbel_softmax_grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - "Out@GRAD", "gumbel_softmax_grad"); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Out"), - ctx->GetInputDim(framework::GradVarName("Out")), - platform::errors::InvalidArgument("Input(Out) and its gradients " - "should have the same shape.")); - - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } }; template @@ -107,17 +90,16 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PT_INFER_META(phi::GumbelSoftmaxInferMeta)); +DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, + GumbelSoftmaxGradInferShapeFunctor, + PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, ops::GumbelSoftmaxGradOpMaker, - ops::GumbelSoftmaxGradOpMaker); -REGISTER_OPERATOR(gumbel_softmax_grad, ops::GumbelSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - gumbel_softmax, - ops::GumbelSoftmaxKernel, - ops::GumbelSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - gumbel_softmax_grad, - ops::GumbelSoftmaxGradKernel, - ops::GumbelSoftmaxGradKernel); + ops::GumbelSoftmaxGradOpMaker, + GumbelSoftmaxInferShapeFunctor); +REGISTER_OPERATOR(gumbel_softmax_grad, ops::GumbelSoftmaxGradOp, + GumbelSoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu deleted file mode 100644 index 880e3eb9f3f9a9c68392f5ea9cc5ab9465676a3a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/gumbel_softmax_op.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -using KeyValuePair = cub::KeyValuePair; - -template -struct UniformCUDAGenerator { - T min_, max_; - unsigned int seed_; - unsigned int offset_ = 0; - HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed) - : min_(min), max_(max), seed_(seed) {} - HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed, - unsigned int offset) - : min_(min), max_(max), seed_(seed), offset_(offset) {} - - HOSTDEVICE T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - return dist(rng); - } -}; - -template -__global__ void OneHotCUDAKernel(const int64_t height, const int64_t width, - const int64_t size_out_axis, const T init, - const T* in, T* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int64_t idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / size_out_axis; - int w = idx % size_out_axis; - cub::ArgMax reducer; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = reducer( - {k, in[h * width * size_out_axis + k * size_out_axis + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - int index = static_cast(kv_pair.key); - out[h * width * size_out_axis + index * size_out_axis + w] = 1; - } - __syncthreads(); - } -} - -template -struct OneHotGenerator { - static void Transform(const platform::CUDADeviceContext& context, - const Tensor& X, Tensor* Out, int axis) { - const int size_to_axis = SizeToAxis(axis, X.dims()); - const int size_from_axis = SizeFromAxis(axis, X.dims()); - const int size_out_axis = SizeOutAxis(axis, X.dims()); - constexpr int thread_size = 512; - int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0]; - int64_t height = size_to_axis * size_out_axis; - int block_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - Tensor input_tensor; - input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); - paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); - phi::funcs::set_constant(context, Out, 0.0); - OneHotCUDAKernel< - T, thread_size><<>>( - height, size_from_axis / size_out_axis, size_out_axis, - std::numeric_limits::lowest(), input_tensor.data(), - Out->data()); - } -}; - -template -__global__ void AddGumbelNoiseCUDAKernel(const T* input_data, T* output_data, - T* noise, const float temperature, - int64_t n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int step = blockDim.x * gridDim.x; - for (int64_t i = index; i < n; i += step) { - T gumbel_noise = -log(-log(noise[i])); - output_data[i] = (gumbel_noise + input_data[i]) / temperature; - } -} - -template -struct GumbleNoiseGenerator { - static void Transform(const platform::CUDADeviceContext& context, - const T* input_data, T* output_data, int size_to_axis, - int size_from_axis, const float temperature) { - Tensor random_tensor; - int64_t size = size_to_axis * size_from_axis; - T* random_data = - random_tensor.mutable_data({size}, platform::CUDAPlace()); - thrust::counting_iterator index_sequence_begin(0); - - // generate gumbel noise - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy()) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); - } else { - const unsigned int seed = std::random_device()(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed)); - } - - // add gumbel noise to X - const int thread_size = 512; - int64_t block_size = (size + thread_size) / thread_size; - AddGumbelNoiseCUDAKernel< - T><<>>( - input_data, output_data, random_data, temperature, size); - } -}; - -#endif -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - gumbel_softmax, ops::GumbelSoftmaxKernel, - ops::GumbelSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - gumbel_softmax_grad, - ops::GumbelSoftmaxGradKernel, - ops::GumbelSoftmaxGradKernel); diff --git a/paddle/fluid/operators/gumbel_softmax_op.h b/paddle/fluid/operators/gumbel_softmax_op.h deleted file mode 100644 index daddd13d7be5e3d7c742a0fa4def3b1828eb27ce..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gumbel_softmax_op.h +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
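The gumbel_softmax CUDA implementation being deleted above applies the standard Gumbel-Softmax transform: draw u ~ U(1e-5, 1), form Gumbel noise g = -log(-log(u)), add it to the logits, divide by the temperature, and take a softmax (with an extra argmax one-hot pass when hard=true). A compact CPU-only reference of the soft path (plain C++, not the Paddle kernels):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <random>
    #include <vector>

    std::vector<double> GumbelSoftmax(const std::vector<double>& logits,
                                      double temperature, unsigned seed) {
      std::mt19937 rng(seed);
      std::uniform_real_distribution<double> uniform(1e-5, 1.0);  // same 0.00001 lower bound as above
      std::vector<double> y(logits.size());
      for (size_t i = 0; i < logits.size(); ++i) {
        double g = -std::log(-std::log(uniform(rng)));  // Gumbel(0, 1) noise
        y[i] = (logits[i] + g) / temperature;
      }
      // numerically stable softmax over the perturbed logits
      double mx = *std::max_element(y.begin(), y.end());
      double sum = 0.0;
      for (double& v : y) { v = std::exp(v - mx); sum += v; }
      for (double& v : y) v /= sum;
      return y;
    }

    int main() {
      for (double p : GumbelSoftmax({1.0, 2.0, 3.0}, 0.5, 7)) std::printf("%.4f ", p);
      std::printf("\n");
      return 0;
    }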
*/ - -#pragma once -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenTensor = framework::EigenTensor; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeOutAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ArgMaxFunctor { - void operator()(const DeviceContext& ctx, const Tensor& in, - Tensor* index_tensor, const int64_t& axis) { - auto in_eigen = EigenTensor::From(in, in.dims()); - auto index_eigen = EigenTensor::From(*index_tensor); - index_eigen = in_eigen.argmax(axis).template cast(); - } -}; -template -struct GumbleNoiseGenerator; - -template -struct OneHotGenerator; - -template -struct GumbleNoiseGenerator { - static void Transform(const platform::CPUDeviceContext& context, - const T* input_data, T* output_data, int size_to_axis, - int size_from_axis, const float temperature) { - // generate uniform random number - const int size = size_to_axis * size_from_axis; - std::uniform_real_distribution dist(0.00001, 1); - auto engine = paddle::framework::GetCPURandomEngine(0); - Tensor random_tensor; - auto* random_data = - random_tensor.mutable_data({size}, platform::CPUPlace()); - for (int64_t i = 0; i < size; ++i) { - random_data[i] = dist(*engine); - } - - // generate gumbel noise - framework::DDim dim_2d{size_to_axis, size_from_axis}; - auto gumbel_noise_eigen = EigenMatrix::From(random_tensor, dim_2d); - gumbel_noise_eigen = -(((-(gumbel_noise_eigen.log())).log())); - - // add noise - for (int64_t i = 0; i < size_to_axis * size_from_axis; i++) { - output_data[i] = (input_data[i] + random_data[i]) / temperature; - } - } -}; -template -struct OneHotGenerator { - static void Transform(const platform::CPUDeviceContext& context, - const Tensor& X, Tensor* Out, int axis) { - Tensor index; - std::vector index_dim; - const auto rank = X.dims().size(); - const int size_to_axis = SizeToAxis(axis, X.dims()); - const int size_from_axis = SizeFromAxis(axis, X.dims()); - const int size_out_axis = SizeOutAxis(axis, X.dims()); - - for (int i = 0; i < X.dims().size(); i++) { - if (i != axis) index_dim.push_back(X.dims().Get()[i]); - } - DDim index_ddim(index_dim.data(), rank - 1); - index.Resize(index_ddim); - auto* index_data = index.mutable_data(context.GetPlace()); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMaxFunctor functor##rank; \ - functor##rank(context, *Out, &index, axis); - switch (Out->dims().size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - 
PADDLE_ENFORCE_LE(Out->dims().size(), 6, - platform::errors::InvalidArgument( - "gumbel_softmax operator doesn't supports " - "tensors whose ranks are greater " - "than 6 in CPU mode.")); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - - phi::funcs::set_constant(context, Out, 0.0); - for (int i = 0; i < size_to_axis; i++) { - for (int j = 0; j < size_out_axis; j++) { - *(Out->data() + i * size_from_axis + j + - index_data[i * size_out_axis + j] * size_out_axis) = 1.0; - } - } - } -}; - -template -class GumbelSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = X->dims()[axis]; - const bool is_hard = context.Attr("hard"); - const float temperature = context.Attr("temperature"); - PADDLE_ENFORCE_GT(temperature, 0, - platform::errors::InvalidArgument( - "The temperature must be greater than 0. But " - "received temperature = %f", - temperature)); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - if (Out->numel() == 0) { - return; - } - - const int size_to_axis = SizeToAxis(axis, X->dims()); - const int size_from_axis = SizeFromAxis(axis, X->dims()); - Tensor X_noise_2d, Out_2d; - X_noise_2d.Resize({size_to_axis, size_from_axis}); - Out_2d.ShareDataWith(*Out).Resize({size_to_axis, size_from_axis}); - - // generate gumbel noise and add it to X - auto* x_noise_data = X_noise_2d.mutable_data(context.GetPlace()); - GumbleNoiseGenerator::Transform( - context.template device_context(), X->data(), - x_noise_data, size_to_axis, size_from_axis, temperature); - -#ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_noise_2d, - &Out_2d); -#else - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_noise_2d, - &Out_2d); -#endif - - if (is_hard) { - OneHotGenerator::Transform( - context.template device_context(), *X, Out, axis); - } - } -}; - -template -class GumbelSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = dX->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = dX->dims()[axis]; - // allocate memory on device. 
- dX->mutable_data(context.GetPlace()); - if (dX->numel() == 0) { - return; - } - - const int size_to_axis = SizeToAxis(axis, dX->dims()); - const int size_from_axis = SizeFromAxis(axis, dX->dims()); - Tensor dX_2d, Out_2d, dOut_2d; - dX_2d.ShareDataWith(*dX).Resize({size_to_axis, size_from_axis}); - Out_2d.ShareDataWith(*Out).Resize({size_to_axis, size_from_axis}); - dOut_2d.ShareDataWith(*dOut).Resize({size_to_axis, size_from_axis}); - math::SoftmaxGradFunctor()( - context.template device_context(), axis_dim, &Out_2d, - &dOut_2d, &dX_2d); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index c572870d950a8200dc3398e5e1e5c5ab28d2332b..105d818e197434c4ed85126228e06d45bf06e498 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" - -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -39,18 +40,6 @@ class IncrementOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("X")), 1UL, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be 1." - "Now the number is %d.", - phi::product(ctx->GetInputDim("X")))); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "increment"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "increment"); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -98,17 +87,9 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PT_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, - ops::IncrementGradOpMaker); -REGISTER_OP_CPU_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); - -REGISTER_OP_CUDA_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); + ops::IncrementGradOpMaker, + IncrementInferShapeFunctor); diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h deleted file mode 100644 index 4b9d07146484ff00ba105b9971f40f91dd8148de..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/increment_op.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
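The increment hunk above trades the fluid InferShape (which enforced that Input(X) holds exactly one element) for DELCARE_INFER_SHAPE_FUNCTOR bound to phi::IncrementInferMeta, and drops the Eigen kernels, presumably covered by the phi increment kernel. The observable contract is small; a sketch of it (illustrative only, not the phi implementation):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // increment: input must be a single-element tensor; out = x + step.
    float Increment(const std::vector<float>& x, float step) {
      assert(x.size() == 1 && "increment expects exactly one element");
      return x[0] + step;
    }

    int main() {
      std::printf("%g\n", Increment({41.f}, 1.f));  // 42
      return 0;
    }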
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class IncrementKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); - float step = context.Attr("step"); - - out_tensor->mutable_data(context.GetPlace()); - auto& dev = - *context.template device_context().eigen_device(); - EigenAdd, T>::Eval( - dev, framework::EigenScalar::From(*out_tensor), - framework::EigenScalar::From(*x_tensor), static_cast(step)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 1c7c8a19110bc8e9e39b95478e4f06ff0eb50ef9..16f1b3b1269952b11f611e6c6988ed3199977994 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 3d6a5e0ea88a28addaf09d90cae9659cbea85305..2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -31,24 +31,24 @@ namespace operators { namespace kps = phi::kps; template -__global__ void VectorizedIndexKernel(T *out, int numel, int main_offset, +__global__ void VectorizedIndexKernel(T *out, size_t numel, size_t main_offset, Functor func) { - int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; - int args[VecSize]; + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; T result[VecSize]; for (; data_offset < main_offset; data_offset += stride) { - kps::InitWithDataIndex(&args[0], data_offset); - kps::ElementwiseUnary(&result[0], &args[0], - func); + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], + &args[0], func); kps::WriteData(out + data_offset, &result[0], BLOCK_NUM_X * VecSize); } - int num = numel - data_offset; + size_t num = numel - data_offset; if (num > 0) { - kps::InitWithDataIndex(&args[0], data_offset); - kps::ElementwiseUnary(&result[0], &args[0], - func); + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], + &args[0], func); kps::WriteData(out + data_offset, &result[0], num); } } @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = 
paddle::platform::GetVectorizedSize((out->data())); + int vec_size = paddle::platform::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; @@ -70,8 +70,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int block = config.thread_per_block.x; auto stream = dev_ctx.stream(); #endif - - int main_offset = (numel / (vec_size * block)) * vec_size * block; + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; switch (vec_size) { case 4: VectorizedIndexKernel<<>>( diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index d6e2b3ecff8c83e47a9016cc3d233d1aa03fb52b..0e69b397e04c7eda7f515350caf870be5d7b57a5 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -31,9 +31,17 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -48,6 +56,10 @@ class LogSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "The dimension index of Input(x) to perform log_softmax," "default -1 for last dimension") .SetDefault(-1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( LogSoftmax Operator. diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 49f4ff3107026000726738a640d635739023bc62..f323e2e041d994eb01c9d4e934984b8a005ffcec 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -88,8 +88,8 @@ void SetValueCompute(const framework::ExecutionContext& ctx, // set_value is what we want. 
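Back in the index_impl.cu.h hunk above, the offsets are widened from int to size_t and main_offset becomes the largest multiple of vec_size * block not exceeding numel; the vectorized loop covers [0, main_offset) and a tail write bounded by the remaining count handles the rest. The split arithmetic, in isolation:

    #include <cstdio>

    int main() {
      size_t numel = 10000, vec_size = 4, block = 256;
      size_t chunk = vec_size * block;               // elements consumed per block per pass
      size_t main_offset = (numel / chunk) * chunk;  // vectorized region: 9216
      size_t tail = numel - main_offset;             // remaining elements: 784
      std::printf("main_offset=%zu tail=%zu\n", main_offset, tail);
      return 0;
    }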
paddle::framework::TensorCopy(*in, place, out); - Tensor slice_tensor(framework::TransToPtenDataType(dtype)), - pad_tensor(framework::TransToPtenDataType(dtype)); + Tensor slice_tensor(framework::TransToPhiDataType(dtype)), + pad_tensor(framework::TransToPhiDataType(dtype)); slice_tensor.mutable_data(slice_dims, place); pad_tensor.mutable_data(in_dims, place); @@ -147,7 +147,7 @@ void SetValueCompute(const framework::ExecutionContext& ctx, ElementwiseComputeEx, DeviceContext, T>( ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); } else { - Tensor value_t(framework::TransToPtenDataType(dtype)); + Tensor value_t(framework::TransToPhiDataType(dtype)); auto value_dims = phi::make_ddim(shape); CheckIsDimsMatch(slice_dims_for_assign, value_dims); @@ -224,8 +224,8 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->mutable_data(dev_ctx.GetPlace()); phi::AddRawKernel< - T, typename paddle::framework::ConvertToPtenContext::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), src1, src2, -1, out); } @@ -237,8 +237,8 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->mutable_data(dev_ctx.GetPlace()); phi::SubtractRawKernel< - T, typename paddle::framework::ConvertToPtenContext::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), src1, src2, -1, out); } diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index c6405f65ee3dd21bf0c993bff93f491b706b32f4..a2e34d98461e0107f27d51d3ce7a618c34ca7ea3 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -26,6 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -246,8 +247,8 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { const auto& labels_dims = labels->dims(); const int axis = logits_dims.size() - 1; - const int N = SizeToAxis(axis, logits_dims); - const int D = SizeFromAxis(axis, logits_dims); + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); int blocks = NumBlocks(N); int threads = kNumCUDAThreads; @@ -401,8 +402,8 @@ class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; - const int N = SizeToAxis(axis, sofrmax_dims); - const int D = SizeFromAxis(axis, sofrmax_dims); + const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); if (return_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), diff --git a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h index fe0dab5d47d35a56e1806ecb2c47e9cfc8197cd0..9261c84c8552c3eb6b441a28324859970eb0a0b3 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.h +++ b/paddle/fluid/operators/margin_cross_entropy_op.h @@ -22,7 +22,6 @@ limitations under the License. 
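margin_cross_entropy_op.cu above now takes SizeToAxis/SizeFromAxis from phi::funcs (axis_utils.h) instead of operator-local copies such as the ones deleted from gumbel_softmax_op.h earlier in this patch. What they compute is the flattening of an N-D tensor into a 2-D [N, D] view around an axis; the same logic as the removed helpers:

    #include <cstdio>
    #include <vector>

    int64_t SizeToAxis(int axis, const std::vector<int64_t>& dims) {
      int64_t size = 1;
      for (int i = 0; i < axis; ++i) size *= dims[i];
      return size;
    }

    int64_t SizeFromAxis(int axis, const std::vector<int64_t>& dims) {
      int64_t size = 1;
      for (int i = axis; i < static_cast<int>(dims.size()); ++i) size *= dims[i];
      return size;
    }

    int main() {
      std::vector<int64_t> logits_dims = {8, 16, 1000};
      int axis = static_cast<int>(logits_dims.size()) - 1;  // last axis, as in the kernel
      std::printf("N=%lld D=%lld\n",
                  static_cast<long long>(SizeToAxis(axis, logits_dims)),     // 128
                  static_cast<long long>(SizeFromAxis(axis, logits_dims)));  // 1000
      return 0;
    }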
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d7d1093b9b3bf2f9f605c7c45c6d5f8a4e52bb6a..ac6566a87030d4c9cf613134cfe85c379fea5e20 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -42,6 +42,7 @@ endif() math_library(fc DEPS blas jit_kernel_helper) math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) math_library(prelu) diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index fa2018178f44ff4e3b14937c1f508fa8a698e20e..c855cb763a97b24222c77f064f80fbc2a50e1f9f 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace operators { @@ -26,6 +27,13 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 692a077f1050fff064e89ef4fad2633972af2d9c..fd879e9e6ffe72a2175acc2db98727f5ff39fbbb 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -16,6 +16,7 @@ limitations under the License. 
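The new lines in math/softmax.cc above are explicit instantiation definitions for the phi::CPUContext variants of SoftmaxFunctor/SoftmaxGradFunctor, presumably so that code built against the phi context type links without the platform::CPUDeviceContext variants. The language mechanism at work, in miniature (names below are illustrative):

    #include <cstdio>

    template <typename Context, typename T>
    struct ScaleFunctor {
      void operator()(const Context&, T* value, T scale) const { *value *= scale; }
    };

    struct CpuContextDemo {};  // stand-in for phi::CPUContext (illustrative only)

    // Explicit instantiation definition: forces this specialization to be emitted
    // in this translation unit, which is what the added
    // "template class SoftmaxFunctor<...>;" lines do for the real functors.
    template struct ScaleFunctor<CpuContextDemo, float>;

    int main() {
      float v = 2.f;
      ScaleFunctor<CpuContextDemo, float>()(CpuContextDemo{}, &v, 3.f);
      std::printf("%g\n", v);  // 6
      return 0;
    }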
*/ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -139,6 +140,16 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 1f87513bb4bea0208bc8de945aa56ffed198ab61..2598d3b0277c94a52e1fa14b04c00b595071f312 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -35,8 +35,8 @@ class AccuracyMLUKernel : public framework::OpKernel { } // cast `indices` or `label` if their type is not INT32 - Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); - Tensor label_int32(framework::TransToPtenDataType(VT::INT32)); + Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + Tensor label_int32(framework::TransToPhiDataType(VT::INT32)); auto indices_type = framework::TransToProtoVarType(indices->type()); if (indices_type != VT::INT32) { PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32), true, @@ -78,7 +78,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // equal MLUCnnlTensorDesc indices_int32_desc(indices_int32); MLUCnnlTensorDesc label_int32_desc(label_int32); - Tensor equal_tensor(framework::TransToPtenDataType(VT::BOOL)); + Tensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); equal_tensor.Resize(indices->dims()); equal_tensor.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_tensor_desc(equal_tensor); @@ -88,7 +88,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&equal_tensor)); // cast equal - Tensor equal_fp32(framework::TransToPtenDataType(VT::FP32)); + Tensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); equal_fp32.Resize(indices->dims()); equal_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_fp32_desc(equal_fp32); @@ -99,7 +99,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // [correct] // reduce_max - Tensor correct_max(framework::TransToPtenDataType(VT::FP32)); + Tensor correct_max(framework::TransToPhiDataType(VT::FP32)); correct_max.Resize(phi::make_ddim({num_samples})); correct_max.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_max_desc(correct_max); @@ -112,7 +112,7 @@ class AccuracyMLUKernel : public framework::OpKernel { correct_max_desc.get(), GetBasePtr(&correct_max)); // reduce_sum - Tensor correct_sum(framework::TransToPtenDataType(VT::FP32)); + Tensor correct_sum(framework::TransToPhiDataType(VT::FP32)); correct_sum.Resize(correct->dims()); correct_sum.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_sum_desc(correct_sum); @@ -138,7 +138,7 @@ class AccuracyMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, num_samples, total_desc.get(), GetBasePtr(total)); // use `total` of type `float32` for calculating accuracy - Tensor total_fp32(framework::TransToPtenDataType(VT::FP32)); + Tensor total_fp32(framework::TransToPhiDataType(VT::FP32)); 
total_fp32.Resize(total->dims()); total_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_fp32_desc(total_fp32); diff --git a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..626d3ef40b16655af7b45092cf388a92d62c349d --- /dev/null +++ b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +class LogSoftmaxMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* x, + const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + const auto logsoftmax_tz = phi::vectorize(x->dims()); + const auto md = dnnl::memory::desc( + logsoftmax_tz, platform::MKLDNNGetDataType(), x->format()); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_inference, + md, axis); + } +}; + +template +class LogSoftmaxMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + + int axis = ctx.Attr("axis"); + axis = axis >= 0 ? axis : x->dims().size() + axis; + + LogSoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), x, axis); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto logsoftmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + logsoftmax_p->execute(astream, {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(x->format()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(log_softmax, MKLDNN, ::paddle::platform::CPUPlace, + ops::LogSoftmaxMKLDNNKernel, + ops::LogSoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 2effcbf9f46dd52d9f2dde03a08bcda7a5247e1a..a0e50aa297851b1c6129b169e01b6fa43c1c326c 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
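The new log_softmax_mkldnn_op.cc above registers a oneDNN forward-only kernel; the axis attribute is normalized with axis = axis >= 0 ? axis : rank + axis before being handed to the primitive. As a reference for what it computes per slice along that axis, a numerically stable scalar version (plain C++, not the oneDNN path):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // log_softmax(x)_i = x_i - (max(x) + log(sum_j exp(x_j - max(x))))
    std::vector<double> LogSoftmax(const std::vector<double>& x) {
      double mx = *std::max_element(x.begin(), x.end());
      double sum = 0.0;
      for (double v : x) sum += std::exp(v - mx);
      double lse = mx + std::log(sum);
      std::vector<double> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] - lse;
      return out;
    }

    int main() {
      for (double v : LogSoftmax({1.0, 2.0, 3.0})) std::printf("%.4f ", v);
      std::printf("\n");  // -2.4076 -1.4076 -0.4076
      return 0;
    }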
*/ -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -70,7 +71,8 @@ class SoftmaxMKLDNNHandler out_grad->dims(), in_x_grad->dims())); auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), dims.size()); auto softmax_tz = phi::vectorize(dims); auto data_softmax_md = MKLDNNMemDesc( @@ -96,7 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { Tensor* output = ctx.Output("Out"); bool is_inplaced = input->IsSharedBufferWith(*output); - const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), input->dims().size()); SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, output, axis); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 9c5bad86278ed0f47ebb5ebd4ede19b714ec8120..2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -31,7 +31,7 @@ USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 92c58ae0a77679147f76c5113e07f300f4cf2ba2..c776cf2a7c792c429fcf45a367d3f06bf9add5d2 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 056e0690c01fdb1e7a9726db6905c05c7dc1eb54..2cbecba9fa081970221242555b6b805ff9acae83 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -85,7 +85,7 @@ inline cnnlDataType_t ToCnnlDataType( inline cnnlDataType_t ToCnnlDataType( const paddle::framework::proto::VarType::Type& type) { - return ToCnnlDataType(framework::TransToPtenDataType(type)); + return ToCnnlDataType(framework::TransToPhiDataType(type)); } template diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 02479222747df9824cf8d0eacddd89a74a8ea28e..1143f9cb37aa54bea430d3a8bca8b62b02da4e2b 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -11,16 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/multinomial_op.h" #include #include #include -#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -46,61 +46,6 @@ This OP returns a Tensor filled with the sampled categoris according to Multinom class MultinomialOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Multinomial"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multinomial"); - - auto x_dim = ctx->GetInputDim("X"); - int64_t x_rank = x_dim.size(); - PADDLE_ENFORCE_GT(x_rank, 0, - platform::errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be > 0, but got %d.", - x_rank)); - PADDLE_ENFORCE_LE(x_rank, 2, - platform::errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be <= 2, but got %d.", - x_rank)); - - std::vector out_dims(x_rank); - for (int64_t i = 0; i < x_rank - 1; i++) { - out_dims[i] = x_dim[i]; - } - - int64_t num_samples = ctx->Attrs().Get("num_samples"); - PADDLE_ENFORCE_GT( - num_samples, 0, - platform::errors::InvalidArgument( - "The number of samples should be > 0, but got %d.", num_samples)); - out_dims[x_rank - 1] = num_samples; - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } -}; - -template -class MultinomialOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto *in_data = x->data(); - int64_t *out_data = out->mutable_data(ctx.GetPlace()); - - auto in_dims = x->dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; - - MultinomialFunctor(out_data, in_data, num_samples, replacement, - num_categories, num_distributions); - } }; } // namespace operators @@ -108,11 +53,10 @@ class MultinomialOpKernel namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PT_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - multinomial, ops::MultinomialOpKernel, - ops::MultinomialOpKernel); + paddle::framework::EmptyGradOpMaker, + MultinomialInferShapeFunctor); diff --git a/paddle/fluid/operators/multinomial_op.cu b/paddle/fluid/operators/multinomial_op.cu deleted file mode 100644 index a07cae8d3dabc98d22ff2423a605915e8260a802..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multinomial_op.cu +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// To-do(qili93): fix this after issue resolved -// https://github.com/ROCmSoftwarePlatform/rocPRIM/issues/202 - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/multinomial_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -__global__ void NormalizeProbability(T* norm_probs, const T* in_data, - T* sum_rows, int64_t num_distributions, - int64_t num_categories) { - int id = threadIdx.x + blockIdx.x * blockDim.x + - blockIdx.y * gridDim.x * blockDim.x; - if (id < num_distributions * num_categories) { - PADDLE_ENFORCE( - in_data[id] >= 0.0, - "The input of multinomial distribution should be >= 0, but got %f.", - in_data[id]); - int64_t row_id = id / num_categories; - PADDLE_ENFORCE(sum_rows[row_id] > 0.0, - "The sum of one multinomial distribution probability should " - "be > 0, but got %f.", - sum_rows[row_id]); - norm_probs[id] = in_data[id] / sum_rows[row_id]; - } -} - -template -__global__ void GetCumulativeProbs(T* norm_probs_data, - int64_t num_distributions, - int64_t num_categories, - T* cumulative_probs) { - int id = blockIdx.x; - thrust::inclusive_scan(thrust::device, norm_probs_data + id * num_categories, - norm_probs_data + (id + 1) * num_categories, - cumulative_probs + id * num_categories); -} - -template -struct RandomGeneratorCudaFunctor { - unsigned int seed_; - __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n); - return dist(rng); - } -}; - -template -__device__ int binarySearchFunctor(T* cumulative_probs, T* norm_probs_data, - int num_categories, T rng_number) { - int left = 0; - int right = num_categories; - - while (right - left > 0) { - int mid = left + (right - left) / 2; - - T temp_prob = cumulative_probs[mid]; - if (temp_prob < rng_number) { - left = mid + 1; - } else { - right = mid; - } - } - - if (left == num_categories) { - left = num_categories - 1; - } - - while (left >= 1 && norm_probs_data[left] == 0) left--; - - return left; -} - -template -__global__ void sampleMultinomialWithReplacement( - T* rng_data, const int64_t num_samples, int64_t* out_data, - const int64_t num_distributions, const int64_t num_categories, - T* cumulative_probs, T* norm_probs_data) { - // use binary search to get the selected category sample id. - // let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. 
- - // for every distribution - int dist = blockIdx.y; - // for every sample - int sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample < num_samples) { - T rng_number = rng_data[sample + dist * num_samples]; - - // Find the bucket that a uniform random number lies in - int selected_category = binarySearchFunctor( - cumulative_probs + dist * num_categories, - norm_probs_data + dist * num_categories, num_categories, rng_number); - - out_data[sample + dist * num_samples] = selected_category; - } -} - -template -class MultinomialOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto* in_data = x->data(); - int64_t* out_data = out->mutable_data(ctx.GetPlace()); - - auto in_dims = x->dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; - - // If replacement is False, it's not a replaceable sample. Every category - // can - // be used only once. So after every sample, probability of the distribution - // will change. The implementation can't be parallelizable. Thus, call CPU - // implementation ``MultinomialFunctor`` to sample the distribution. - if (!replacement) { - int64_t in_data_numel = x->numel(); - int64_t out_data_numel = out->numel(); - - T* cpu_in_data = new T[in_data_numel]; - int64_t* cpu_out_data = new int64_t[out_data_numel]; - -#ifdef PADDLE_WITH_HIP - hipMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T), - hipMemcpyDeviceToHost); -#else - cudaMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T), - cudaMemcpyDeviceToHost); -#endif - - MultinomialFunctor(cpu_out_data, cpu_in_data, num_samples, replacement, - num_categories, num_distributions); - -#ifdef PADDLE_WITH_HIP - hipMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(int64_t), - hipMemcpyHostToDevice); -#else - cudaMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(int64_t), - cudaMemcpyHostToDevice); -#endif - - delete[] cpu_in_data; - delete[] cpu_out_data; - return; - } - - // Sum of input may not be 1. To get probability in range [0, 1], calculate - // sum of each row of input, and then use the sum to normalize the input. - // sum_row_data: sum of each row - framework::Tensor sum_rows_tensor; - auto* sum_rows_data = - sum_rows_tensor.mutable_data({num_distributions}, ctx.GetPlace()); - - auto& place = *ctx.template device_context() - .eigen_device(); - - if (num_distributions == 1) { - auto eigen_input = framework::EigenVector::Flatten(*x); - auto eigen_sum_rows = framework::EigenVector::Flatten(sum_rows_tensor); - eigen_sum_rows.device(place) = - eigen_input.sum(Eigen::DSizes(1)) - .eval() - .reshape(Eigen::DSizes(sum_rows_tensor.dims()[0])); - } else { - auto eigen_input = framework::EigenMatrix::From(*x); - auto eigen_sum_rows = framework::EigenVector::Flatten(sum_rows_tensor); - eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)); - } - - // Normalize row of each distribution to get the probability in range [0, - // 1]. 
- // norm_probs_data: probability of the distribution - framework::Tensor norm_probs_tensor; - auto* norm_probs_data = norm_probs_tensor.mutable_data( - {num_distributions, num_categories}, ctx.GetPlace()); - - // number of threads in a block is min(num_categories, 512) - dim3 block_norm(num_categories < 512 ? num_categories : 512); - dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); - NormalizeProbability< - T><<>>( - norm_probs_data, in_data, sum_rows_data, num_distributions, - num_categories); - - // Get cumulative probability of each distribution. It's the same function - // of - // ``cumsum`` op. - framework::Tensor cumulative_probs_tensor; - auto* cumulative_probs = cumulative_probs_tensor.mutable_data( - {num_distributions, num_categories}, ctx.GetPlace()); - dim3 block_cumsum(1); - dim3 grid_cumsum(num_distributions); - GetCumulativeProbs<<>>( - norm_probs_data, num_distributions, num_categories, cumulative_probs); - - // Generate random number for each sample. - std::random_device rd; - auto seed = rd(); - - framework::Tensor rng_data_tensor; - auto* rng_data = rng_data_tensor.mutable_data( - {num_distributions, num_samples}, ctx.GetPlace()); - - thrust::counting_iterator index_sequence_begin(0); - platform::Transform trans; - auto* context = - static_cast(&ctx.device_context()); - trans(*context, index_sequence_begin, - index_sequence_begin + num_distributions * num_samples, rng_data, - RandomGeneratorCudaFunctor(seed)); - - // Sample the multinomial distributions. - dim3 block_sample(128); - dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); - sampleMultinomialWithReplacement<<>>( - rng_data, num_samples, out_data, num_distributions, num_categories, - cumulative_probs, norm_probs_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - multinomial, ops::MultinomialOpKernel, - ops::MultinomialOpKernel); - -#endif diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index 01135bab6d1d26337eb1bc53867506eae906eea5..ab9f10070fc60deab8974ae0e81e2b4c6cef2ffd 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -12,7 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mv_op.h" +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { @@ -116,10 +122,3 @@ REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, ops::MVOpGradMaker); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); - -REGISTER_OP_CPU_KERNEL( - mv, ops::MVKernel, - ops::MVKernel); -REGISTER_OP_CPU_KERNEL( - mv_grad, ops::MVGradKernel, - ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu deleted file mode 100644 index b8b61ae49047216b94bbaa35a120b551e1aea91b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mv_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace paddle { -namespace operators { - -template -__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout, - const T *vec, T *dx) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < m * n; idx += blockDim.x * gridDim.x) { - int i = idx / n; - int j = idx % n; - dx[idx] = dout[i] * vec[j]; - } -} - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// dX = | dOut Vec^T -// dVec = | X^T dOut -template -class MVGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - auto *dout = - context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dvec = - context.Output(framework::GradVarName("Vec")); - - auto dim_x = x->dims(); - int m = dim_x[0]; - int n = dim_x[1]; - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - const T *dout_data = dout->data(); - - auto &dev_ctx = - context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto stream = context.cuda_device_context().stream(); - auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); - - if (dx) { - T *dx_data = dx->mutable_data(context.GetPlace()); - - MVGradDxCUDAKernel< - T><<>>( - m, n, dout_data, vec_data, dx_data); - } - - if (dvec) { - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - mv, ops::MVKernel, - ops::MVKernel); -REGISTER_OP_CUDA_KERNEL( - mv_grad, ops::MVGradKernel, - ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h deleted file mode 100644 index c0a2172af3677220ff2816bf0f9b7d8ade0d8ba1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mv_op.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MVKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - - auto *out = context.Output("Out"); - - auto dim_x = x->dims(); - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - T *out_data = out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(false, dim_x[0], dim_x[1], static_cast(1), x_data, vec_data, - static_cast(0), out_data); - } -}; - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// dX = | dOut vec^T -// dVec = | X^T dOut -template -class MVGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - auto *dout = - context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dvec = - context.Output(framework::GradVarName("Vec")); - - auto dim_x = x->dims(); - int m = dim_x[0]; - int n = dim_x[1]; - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - const T *dout_data = dout->data(); - - if (dx) { - T *dx_data = dx->mutable_data(context.GetPlace()); - - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - dx_data[i * n + j] = dout_data[i] * vec_data[j]; - } - } - } - - if (dvec) { - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3bb605d7f553ea8a72f0c716d4732ac59e984951..3445e9b658becda84aa678e9c1f03b3436d63b70 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -284,6 +284,16 @@ static void CopyVectorToTensor(const std::vector &src, memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); } +template +static void CopyVectorToCPUTensor(const std::vector &src, + framework::Tensor *dst) { + dst->Resize({static_cast(src.size())}); + T *dst_ptr = dst->mutable_data(platform::CPUPlace()); + const T *src_ptr = src.data(); + auto nbytes = src.size() * sizeof(T); + std::memcpy(dst_ptr, src_ptr, nbytes); +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -677,14 +687,14 @@ class DistributedFusedLambInitOpKernel lengths.back()); } - CopyVectorToTensor( + CopyVectorToCPUTensor(numel_offsets, + ctx.Output("FusedParamOffsets")); + CopyVectorToCPUTensor( fp32_partial_numel_offsets, - ctx.Output("FP32ShardFusedParamOffsets"), place, - stream); - CopyVectorToTensor( + 
ctx.Output("FP32ShardFusedParamOffsets")); + CopyVectorToCPUTensor( fp16_partial_numel_offsets, - ctx.Output("FP16ShardFusedParamOffsets"), place, - stream); + ctx.Output("FP16ShardFusedParamOffsets")); // Fill the weight decay tensor PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 748f8206adbc7fec0b14ca5a72206004951f682c..e5b27446eb330aeb08e134332a5366c6c6ed2908 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -33,12 +33,7 @@ class DistributedFusedLambOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { - if (var_name == "ParamInfo") { - return expected_kernel_type; - } else { - return framework::OperatorWithKernel::GetKernelTypeForVar( - var_name, tensor, expected_kernel_type); - } + return expected_kernel_type; } }; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index aeecea8a8e0c155eaf58f6e846c3d681dbc94c93..3f90140f77282983f42ef03f736c35960239dd75 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -14,8 +14,10 @@ #include #include "paddle/fluid/memory/buffer.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" +#include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" @@ -40,6 +42,163 @@ namespace operators { template using MasterT = typename details::MPTypeTrait::Type; +template +static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { + static_assert(!std::is_same::value, "T cannot be void."); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); +#endif +} + +template +struct L2NormFunctor { + DEVICE void operator()(int tensor_id, int chunk_id, int offset, int size, + const T *x, MasterT *y, int max_chunk_num) const { + using MT = MasterT; + const T *ptr = x + offset; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage storage; + + MT square_sum = static_cast(0); + int i; + for (i = threadIdx.x * VecSize; i + VecSize <= size; + i += (BlockDim * VecSize)) { + platform::AlignedVector tmp_vec; + platform::Load(ptr + i, &tmp_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + auto tmp = static_cast(tmp_vec[j]); + square_sum += (tmp * tmp); + } + } + + for (; i < size; ++i) { + auto tmp = static_cast(ptr[i]); + square_sum += (tmp * tmp); + } + + square_sum = BlockReduce(storage).Reduce(square_sum, cub::Sum()); + if (threadIdx.x == 0) { + y[tensor_id * max_chunk_num + chunk_id] = square_sum; + } + } +}; + +template +static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( + const InT *x, OutT *y, int max_chunk_num) { + int tensor_id = blockIdx.x; + x += (tensor_id * max_chunk_num); + using 
BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage storage; + InT sum = static_cast(0); + for (int i = threadIdx.x; i < max_chunk_num; i += BlockDim) { + sum += x[i]; + } + sum = BlockReduce(storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + if (NeedSqrt) { + y[blockIdx.x] = static_cast(sqrtf(sum)); + } else { + y[blockIdx.x] = static_cast(sum); + } + } +} + +template +static int GetChunkedVecSize(const T *ptr, int chunk_size) { + static_assert(!std::is_same::value, "T cannot be void."); + + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + auto address = reinterpret_cast(ptr); + constexpr int vec8 = alignof(platform::AlignedVector); + constexpr int vec4 = alignof(platform::AlignedVector); + constexpr int vec2 = alignof(platform::AlignedVector); + if (address % vec8 == 0 && chunk_size % vec8 == 0) { + return std::min(8, valid_vec_size); + } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0 && chunk_size % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ + } + +#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ + } \ + } while (0) + +// TODO(zengjinle): which chunk_size is better? +template +static void MultiTensorL2Norm(const platform::CUDAPlace &place, + gpuStream_t stream, const InT *x, + const int *offsets, int n, OutT *y, + int chunk_size = 65536) { + if (n <= 0) return; + + constexpr int kNumTensor = MaxTensorNumPerLaunch; + constexpr int kNumChunk = MaxChunkNumPerLaunch; + constexpr int kBlockDim = BlockDim; + + int max_chunk_num = -1; + int vec_size = 8; + int total_chunk_num = 0; + for (int i = 0; i < n; ++i) { + vec_size = std::min( + vec_size, GetChunkedVecSize(x + offsets[i] - offsets[0], chunk_size)); + int length = offsets[i + 1] - offsets[i]; + auto tmp_chunk_num = (length + chunk_size - 1) / chunk_size; + max_chunk_num = std::max(max_chunk_num, tmp_chunk_num); + total_chunk_num += tmp_chunk_num; + } + + VLOG(1) << "MultiTensorL2Norm max_chunk_num = " << max_chunk_num + << " , total_chunk_num = " << total_chunk_num + << " , tensor_num = " << n; + + using MT = MasterT; + memory::Buffer tmp_out(place); + auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); + FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); + +#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ + max_chunk_num); \ + } while (0) + + PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + + MultiTensorL2NormReduceAgainCUDAKernel<<>>( + tmp_out_ptr, y, max_chunk_num); +} + template static void LogParamAndTrustRatioDivSquareNorm( const framework::ExecutionContext &ctx, const float *param_square_norm, @@ -620,76 +779,6 @@ static void CubDeviceReduce(InputIteratorT d_in, OutputIteratorT d_out, num_items, reduction_op, init, stream)); } -template -static 
void CubDeviceSegmentedReduce(InputIteratorT d_in, OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, T initial_value, - gpuStream_t stream, - memory::Buffer *buffer) { - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedReduce::Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, - d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream)); - d_temp_storage = buffer->Alloc(temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedReduce::Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, - d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream)); -} - -template -struct AddConstantFunctor { - explicit AddConstantFunctor(T bias) : bias_(bias) {} - - T operator()(T x) const { return x + bias_; } - - private: - T bias_; -}; - -template -struct OffsetWithBiasFunctor { - OffsetWithBiasFunctor(const T *offset, T bias) - : offset_(offset), bias_(bias) {} - - HOSTDEVICE T operator()(T idx) const { return offset_[idx] - bias_; } - - HOSTDEVICE constexpr bool operator==(const OffsetWithBiasFunctor &) const { - return true; - } - - private: - const T *offset_; - const T bias_; -}; - -template -static void CubDeviceSegmentedSquareNorm(const T *x, MasterT *y, int n, - const OffsetT *offset, - OffsetT init_offset, - gpuStream_t stream, - memory::Buffer *buffer) { - if (n <= 0) return; - cub::TransformInputIterator, SquareFunctor, const T *> iter( - x, SquareFunctor()); - if (init_offset == static_cast(0)) { - CubDeviceSegmentedReduce(iter, y, n, offset, offset + 1, cub::Sum(), - static_cast>(0), stream, buffer); - } else { - cub::CountingInputIterator cnt_iter(0); - OffsetWithBiasFunctor functor(offset, init_offset); - cub::TransformInputIterator, - cub::CountingInputIterator> - offset_iter(cnt_iter, functor); - CubDeviceSegmentedReduce(iter, y, n, offset_iter, offset_iter + 1, - cub::Sum(), static_cast>(0), stream, - buffer); - } -} - template static void GetSquareGradNormImpl(const T *grad, int n, float *square_norm, gpuStream_t stream, @@ -862,16 +951,6 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, } } -template -static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { - static_assert(!std::is_same::value, "T cannot be void."); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); -#endif -} - template class DistributedFusedLambOpKernel : public framework::OpKernel { @@ -1191,13 +1270,16 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets_t->data(); VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), place); + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); VLOG(1) << "FP32ShardFusedParamOffsets: " << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), place); + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); VLOG(1) << "FP16ShardFusedParamOffsets: " << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), place); + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); if (num_devices > 1) { if (use_master_param_norm) { @@ -1207,32 +1289,26 @@ class DistributedFusedLambOpKernel 
FillZeroWithPtr(trust_ratio_div_square_norm, param_num, stream); } } - CubDeviceSegmentedSquareNorm(fp32_param, param_square_norm, - fp32_global_param_num, fused_offsets, 0, - stream, &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, fp32_param, fused_offsets, + fp32_global_param_num, param_square_norm); if (use_master_param_norm) { - CubDeviceSegmentedSquareNorm( - master_param + fp16_offset, param_square_norm + fp16_local_start_idx, - fp16_local_param_num, fp16_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, master_param + fp16_offset, + fp16_partial_fused_offsets, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } else { // NOTE: extra computation is performed. We can improve this performance // if needed in the future. - CubDeviceSegmentedSquareNorm( - fp16_param, param_square_norm + fp32_global_param_num, - fp16_global_param_num, fused_offsets + fp32_global_param_num, - static_cast(fp32_numel), stream, &cub_tmp_buffer); + MultiTensorL2Norm( + place, stream, fp16_param, fused_offsets + fp32_global_param_num, + fp16_global_param_num, param_square_norm + fp32_global_param_num); } - CubDeviceSegmentedSquareNorm( - trust_ratio_div, trust_ratio_div_square_norm + fp32_local_start_idx, - fp32_local_param_num, fp32_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); - CubDeviceSegmentedSquareNorm( - trust_ratio_div + fp32_numel_each_device, - trust_ratio_div_square_norm + fp16_local_start_idx, - fp16_local_param_num, fp16_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, trust_ratio_div, + fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div_square_norm + fp32_local_start_idx); + MultiTensorL2Norm(place, stream, trust_ratio_div + fp32_numel_each_device, + fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div_square_norm + fp16_local_start_idx); VLOG(1) << "TrustRatioDiv L2-Norm before allreduce: " << FlattenToString(trust_ratio_div_square_norm, param_num, place); diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h new file mode 100644 index 0000000000000000000000000000000000000000..5d8d03c733dae210e8a41a8ad78a258df558b341 --- /dev/null +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -0,0 +1,156 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "math.h" // NOLINT + +namespace paddle { +namespace operators { + +template +struct TensorMetaList { + static constexpr int kTensorNum = MaxTensorNumPerLaunch; + static constexpr int kChunkNum = MaxChunkNumPerLaunch; + + static_assert(kTensorNum > 0 && kTensorNum < 256, + "kTensorNum must be inside (0, 256)."); + static_assert(kChunkNum > 0 && kChunkNum < 65536, + "kChunkNum must be inside (0, 65536)."); + + /** + * The tensor numel offset of each tensor. 
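+   * (Illustrative example: a single launch covering three tensors with
+   * numels 300, 700 and 500 gives offsets = {0, 300, 1000, 1500}.)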
+ * The offsets[0] would be always 0 in the first launch, + * and then offsets[0] >= 0 in the following other launches. + * The numel of the i-th tensor would be offsets[i + 1] - offsets[i]. + */ + int offsets[kTensorNum + 1]; + + /** + * The tensor id of each chunk. The tensor_ids[0] is always 0. + * Note that tensor_ids would be always in the ascending order. + * The actual tensor id is start_tensor_id + tensor_ids[i]. + * + * The reason why we assume that the actual tensor id is + * start_tensor_id + tensor_ids[i] is to make tensor_ids to be + * a uint8_t array instead of an int array, making sizeof(TensorMetaList) + * smaller, so that kChunkNum can be larger. + */ + uint8_t tensor_ids[kChunkNum]; + + /** + * The chunk id of the chunk inside each tensor. It would be + * something like chunk_ids = [0, 1, 2, 0, 0, 1, 2, 3], meaning + * that there are 3 tensors and each tensor contains 3, 1 and 4 + * chunks. Note that chunk_ids[0] is always 0 and the actual + * chunk id of the first tensor is always start_chunk_id + chunk_ids[i]. + * + * The reason why we assume that the actual chunk id of the first + * tensor is always start_chunk_id + chunk_ids[i] is to make + * chunk_ids to be a uint16_t array instead of an int array, making + * sizeof(TensorMetaList) smaller, so that kChunkNum can be larger. + */ + uint16_t chunk_ids[kChunkNum]; + + /** + * The tensor_ids offset. + */ + int start_tensor_id; + + /** + * The chunk_ids offset. + */ + int start_chunk_id; +}; + +template +static __global__ void MultiTensorApplyCUDAKernel( + Functor functor, + TensorMetaList meta, + int chunk_size, Args... args) { + const int block_id = blockIdx.x; + const int tensor_id = meta.tensor_ids[block_id]; + const int chunk_id = static_cast(meta.chunk_ids[block_id]) + + (tensor_id == 0) * meta.start_chunk_id; + const int prev_offset = meta.offsets[tensor_id]; + const int next_offset = meta.offsets[tensor_id + 1]; + const int ptr_offset = prev_offset + chunk_id * chunk_size; + const int size = min(next_offset - ptr_offset, chunk_size); + + functor(tensor_id + meta.start_tensor_id, chunk_id, ptr_offset, size, + args...); +} + +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + Args... 
args) { + if (n == 0) return; + + constexpr auto NumTensor = MaxTensorNumPerLaunch; + constexpr auto NumChunk = MaxChunkNumPerLaunch; + TensorMetaList metas; + + int tensor_id = 0; + int chunk_id = 0; + int numel_offset = 0; + metas.start_tensor_id = 0; + metas.start_chunk_id = 0; + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + if (tensor_id == 0) { + metas.start_tensor_id = i; + metas.offsets[0] = numel_offset; + } + metas.offsets[tensor_id + 1] = metas.offsets[tensor_id] + length; + ++tensor_id; + numel_offset += length; + + auto chunk_num = (length + chunk_size - 1) / chunk_size; + int last_launch_chunk_id = 0; + for (int j = 0; j < chunk_num; ++j) { + metas.chunk_ids[chunk_id] = j - last_launch_chunk_id; + metas.tensor_ids[chunk_id] = tensor_id - 1; + ++chunk_id; + + bool tensor_full = (tensor_id == NumTensor && j + 1 == chunk_num); + bool block_full = (chunk_id == NumChunk); + bool last_chunk = (i + 1 == n && j + 1 == chunk_num); + + if (tensor_full || block_full || last_chunk) { + MultiTensorApplyCUDAKernel<<>>( + functor, metas, chunk_size, args...); + chunk_id = 0; + if (j + 1 == chunk_num) { // chunk for the current tensor is full + metas.start_chunk_id = 0; + tensor_id = 0; + } else { + metas.offsets[0] = metas.offsets[tensor_id - 1]; + metas.offsets[1] = metas.offsets[tensor_id]; + metas.start_tensor_id = i; + metas.start_chunk_id = j + 1; + last_launch_chunk_id = j + 1; + tensor_id = 1; + } + } + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index cb9bbe727de5c014ebfe9ea93f6fe279a897569b..2a127d9ad1db0c1e169fdd1e20a1568b99d228a0 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pixel_shuffle_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -19,56 +22,6 @@ namespace operators { class PixelShuffleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of PixelShuffleOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of PixelShuffleOp should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or [N, H, W, C], but got %u.", - input_dims.size())); - - auto upscale_factor = ctx->Attrs().Get("upscale_factor"); - - const std::string data_format = - ctx->Attrs().Get("data_format"); - const bool channel_last = (data_format == "NHWC"); - - if (!channel_last) { - PADDLE_ENFORCE_EQ( - input_dims[1] % (upscale_factor * upscale_factor), 0, - platform::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, input_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - input_dims[3] % (upscale_factor * upscale_factor), 0, - platform::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, input_dims[3])); - } - auto output_dims = input_dims; - output_dims[0] = input_dims[0]; - if (!channel_last) { - output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] * upscale_factor; - } else { - output_dims[1] = input_dims[1] * upscale_factor; - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); - } - ctx->SetOutputDim("Out", output_dims); - } }; class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -171,22 +124,16 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PT_INFER_META(phi::PixelShuffleInferMeta)); + REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, - ops::PixelShuffleGradMaker); + ops::PixelShuffleGradMaker, + PixelShuffleInferShapeFunctor); REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); -REGISTER_OP_CPU_KERNEL( - pixel_shuffle, - ops::PixelShuffleOpKernel, - ops::PixelShuffleOpKernel); - -REGISTER_OP_CPU_KERNEL( - pixel_shuffle_grad, - ops::PixelShuffleGradOpKernel, - ops::PixelShuffleGradOpKernel); - REGISTER_OP_VERSION(pixel_shuffle) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu deleted file mode 100644 index 6faf91079e1dac00b3516ccde8dc82cec73a79e6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pixel_shuffle_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pixel_shuffle_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - pixel_shuffle, ops::PixelShuffleOpKernel, - ops::PixelShuffleOpKernel); -REGISTER_OP_CUDA_KERNEL( - pixel_shuffle_grad, - ops::PixelShuffleGradOpKernel, - ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h deleted file mode 100644 index 615bc9772167436aa6aa67e14248a5e853c4350f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pixel_shuffle_op.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PixelShuffleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - int factor = ctx.Attr("upscale_factor"); - - std::string data_format = ctx.Attr("data_format"); - bool channel_last = (data_format == "NHWC"); - - auto in_dims = in->dims(); - auto o_dims = out->dims(); - - framework::Tensor t; - t.ShareDataWith(*in); - if (!channel_last) { - t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); - } else { - t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor}); - } - std::vector axis = {0, 1, 4, 2, 5, 3}; - - framework::Tensor o; - o.ShareDataWith(*out); - if (!channel_last) { - o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); - } else { - o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); - } - phi::funcs::Transpose trans; - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, t, &o, axis); - out->Resize(o_dims); - } -}; - -template -class PixelShuffleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int factor = ctx.Attr("upscale_factor"); - - std::string data_format = ctx.Attr("data_format"); - bool channel_last = (data_format == "NHWC"); - - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); - - framework::Tensor t; - t.ShareDataWith(*dout); - if (!channel_last) { - t.Resize( - {do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); - } else { - t.Resize( - {do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]}); - } - std::vector axis = {0, 1, 3, 5, 2, 4}; - - framework::Tensor o; - o.ShareDataWith(*dx); - if (!channel_last) { - o.Resize( - {do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); - } else { - o.Resize( - {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); - } - phi::funcs::Transpose trans; - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, t, &o, axis); - dx->Resize(dx_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc index 09c58cd7d4cda396d60a94b02cc8a705bb3c3b01..548e28716dd9108ffd55463cccf9f91ad3b9a941 100644 --- a/paddle/fluid/operators/randint_op.cc +++ b/paddle/fluid/operators/randint_op.cc @@ -24,37 +24,6 @@ namespace paddle { namespace operators { -template -class CPURandintKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector new_shape; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { - if (ctx.HasInput("ShapeTensor")) { - auto* shape_tensor = ctx.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - auto* out = ctx.Output("Out"); - if (!new_shape.empty()) 
out->Resize(phi::make_ddim(new_shape)); - T* data = out->mutable_data(ctx.GetPlace()); - int64_t size = out->numel(); - - std::uniform_int_distribution dist(ctx.Attr("low"), - ctx.Attr("high") - 1); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; - class RandintOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -176,6 +145,3 @@ REGISTER_OPERATOR( randint, ops::RandintOp, ops::RandintOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) - -REGISTER_OP_CPU_KERNEL(randint, ops::CPURandintKernel, - ops::CPURandintKernel) diff --git a/paddle/fluid/operators/randint_op.cu b/paddle/fluid/operators/randint_op.cu deleted file mode 100644 index 2f9a8cfd142ec7a3d0175b91bd79f239f654c126..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/randint_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include -#include -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/uniform_random_op.h" - -namespace paddle { -namespace operators { - -template -class GPURandintKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector new_shape; - auto list_new_shape_tensor = - context.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || context.HasInput("ShapeTensor")) { - if (context.HasInput("ShapeTensor")) { - auto* shape_tensor = context.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - - platform::CPUPlace cpu; - auto dtype = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - if (!new_shape.empty()) out->Resize(phi::make_ddim(new_shape)); - T low = static_cast(context.Attr("low")); - T high = static_cast(context.Attr("high")) - 1; - framework::LoDTensor tensor; - tensor.Resize(out->dims()); - tensor.mutable_data(cpu, framework::TransToPtenDataType(dtype)); - T* data = tensor.mutable_data(cpu); - - int64_t size = out->numel(); - unsigned int seed = static_cast(context.Attr("seed")); - - /* - std::minstd_rand engine; - if (seed == 0) { - std::random_device rd; - seed = rd(); - } - engine.seed(seed); - */ - - std::uniform_int_distribution<> dist(context.Attr("low"), - context.Attr("high") - 1); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - - if (platform::is_gpu_place(context.GetPlace())) { - // Copy tensor to out - framework::TensorCopy(tensor, context.GetPlace(), out); - } - } -}; - -} 
// namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(randint, ops::GPURandintKernel, - ops::GPURandintKernel) diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e02f0268b5e510ac8262543db58ee98ef20e517 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + int out_dtype = context.Attr("out_dtype"); + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + auto place = context.GetPlace(); + framework::Tensor cast_out(input->type()); + cast_out.Resize(output->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = framework::TransToProtoVarType(input->dtype()); + + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + output->mutable_data(place); + } + } else { + output->ShareDataWith(cast_out); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_MAX, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_max, ops::ReduceMaxMLUKernel, + 
ops::ReduceMaxMLUKernel, + ops::ReduceMaxMLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..daf5965fd54628a097ad1d53057ec54b9a5d329a --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + int out_dtype = context.Attr("out_dtype"); + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + auto place = context.GetPlace(); + framework::Tensor cast_out(input->type()); + cast_out.Resize(output->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = framework::TransToProtoVarType(input->dtype()); + + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + output->mutable_data(place); + } + } else { + output->ShareDataWith(cast_out); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_MIN, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_min, ops::ReduceMinMLUKernel, + ops::ReduceMinMLUKernel, + ops::ReduceMinMLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h 
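// --- Editor's illustrative sketch (not part of the diff) --------------------
// Both new MLU kernels above (reduce_max / reduce_min) normalize the "dim"
// attribute the same way before building the CNNL reduce descriptor: with
// reduce_all every axis is reduced, otherwise negative axes are wrapped by
// the input rank. A minimal standalone version of that logic, using a
// hypothetical helper name, for reference only:
#include <vector>

std::vector<int> NormalizeReduceDims(const std::vector<int>& dims,
                                     int input_rank, bool reduce_all) {
  std::vector<int> reduce_dims;
  if (reduce_all) {
    for (int i = 0; i < input_rank; ++i) reduce_dims.push_back(i);
  } else {
    for (int d : dims) {
      reduce_dims.push_back(d < 0 ? d + input_rank : d);  // e.g. -1 -> rank-1
    }
  }
  return reduce_dims;
}
// -----------------------------------------------------------------------------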
b/paddle/fluid/operators/reduce_ops/reduce_op.h index ca3575f5dea84321e4fb46cbaa5606652ef267d4..65cca94814e88111239aef3559285d6fe321a72d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -257,12 +257,12 @@ class ReduceKernel : public framework::OpKernel { std::vector tmp_dims(dims.begin(), dims.end()); // call new kernel - phi::Reduce::TYPE, - T, Functor>( - static_cast::TYPE, T, + Functor>( + static_cast::TYPE&>(dev_ctx), *input, reduce_all, tmp_dims, keep_dim, - framework::TransToPtenDataType(cast_out_dtype), output); + framework::TransToPhiDataType(cast_out_dtype), output); } }; template @@ -541,11 +541,12 @@ class ReduceOp : public framework::OperatorWithKernel { #endif if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU place")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()) || + platform::is_npu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()), + true, platform::errors::InvalidArgument( + "float16 can only be used on GPU or NPU or MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -683,7 +684,7 @@ class ReduceCudaKernel : public framework::OpKernel { const Tensor* input = context.Input("X"); Tensor* output = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); - auto pt_out_dtype = paddle::framework::TransToPtenDataType( + auto pt_out_dtype = paddle::framework::TransToPhiDataType( static_cast(out_dtype)); std::vector dims = context.Attr>("dim"); @@ -713,7 +714,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); auto* d_x = context.Output(framework::GradVarName("X")); auto out_dtype = context.Attr("in_dtype"); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); @@ -734,8 +735,8 @@ class ReduceCudaGradKernel : public framework::OpKernel { } else { d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); } - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_d_out = paddle::experimental::MakePhiDenseTensor(new_d_out); + auto pt_d_x = paddle::experimental::MakePhiDenseTensor(*d_x); if (out_dtype <= 0) { pt_out_dtype = d_out->dtype(); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ddb598f575f6737f7c7d4336eeee866b12c12fb1..8d99a60b12967a55e0cc208c6ae96c0dabb5f473 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" // only can include the headers in paddle/phi/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -476,6 +476,21 @@ class Reshape2Op : public ReshapeOp { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : ReshapeOp(type, inputs, outputs, attrs) {} + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, + platform::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); + } }; class Reshape2OpMaker : public ReshapeOpMaker { @@ -636,13 +651,10 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); -DELCARE_INFER_SHAPE_FUNCTOR(reshape2, ReshapeInferShapeFunctor, - PT_INFER_META(phi::ReshapeWithXShapeInferMeta)); - REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker, ops::Reshape2GradMaker, - ReshapeInferShapeFunctor, ops::ReshapeOpInplaceInferer); + ops::ReshapeOpInplaceInferer); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, ops::Reshape2DoubleGradMaker, ops::Reshape2DoubleGradMaker, diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index d6e8f3e5aa1086900d0144ea8757a05776b9c9b0..40f5699a29b355864652b5d899d1918ec663cf0b 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -42,9 +42,9 @@ class ScaleXPUKernel : public framework::OpKernel { framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); auto& dev_ctx = ctx.template device_context(); - // call pten kernel + // call phi kernel phi::ScaleKernel( - static_cast::TYPE&>(dev_ctx), *in, scale, bias, bias_after_scale, out); } diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index 70733d643673ad8acde9a45f273a52a9723fb0d3..e584c1a4cce1e85344c574526098b034723c3059 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/size_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,13 +23,6 @@ namespace operators { class SizeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Size"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Size"); - - ctx->SetOutputDim("Out", {1}); - } }; class SizeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -49,11 +44,10 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PT_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, - ops::SizeKernel, - ops::SizeKernel, ops::SizeKernel, - ops::SizeKernel); + paddle::framework::EmptyGradOpMaker, + SizeInferShapeFunctor); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h deleted file mode 100644 index 8840fde287d662043d032ec83bc7b7e42ead417d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/size_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class SizeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - auto place = ctx.GetPlace(); - auto out_data = out_t->mutable_data(place); - auto cpu_place = platform::CPUPlace(); - if (place == cpu_place) { - out_data[0] = in_t->numel(); - } else { - Tensor cpu_tensor; - auto cpu_data = - cpu_tensor.mutable_data(out_t->dims(), cpu_place); - cpu_data[0] = in_t->numel(); - paddle::framework::TensorCopy(cpu_tensor, place, out_t); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc index 5826d2b4a8742b5572e67237139ff8654b2c9e67..95b97025f2969590000e3d336556c0b02ed037de 100644 --- a/paddle/fluid/operators/size_op_npu.cc +++ b/paddle/fluid/operators/size_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu b/paddle/fluid/operators/softmax_cudnn_op.cu deleted file mode 100644 index 72c2e97c1782ed2a817241a6d17f5f6f52add4ae..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_cudnn_op.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxCUDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.template device_context(); - SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); - } -}; - -template -class SoftmaxGradCUDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.template device_context(); - SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, input_axis, dx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); -#else -REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); -#endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cb97a0bb27cb5c459bf7f2ccd53374759643133f..374992096605bfef0433992193e54306c3a12858 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/softmax_op.h" - #include #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN @@ -251,10 +250,3 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxInplaceInferer); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL( - softmax, ops::SoftmaxKernel, - ops::SoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - softmax_grad, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc deleted file mode 100644 index 19359b7eef5126d84f0707d39095a74ae4561186..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_op.cu.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - softmax, ops::SoftmaxKernel, - ops::SoftmaxKernel, - ops::SoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - softmax_grad, ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h deleted file mode 100644 index 497bbb06dab5f174909684feb0c3bb4546ab3d0e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeOutAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -class SoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = X->dims()[axis]; - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - if (Out->numel() == 0) { - return; - } - - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - Tensor X_2d, Out_2d; - X_2d.ShareDataWith(*X).Resize({n, d}); - Out_2d.ShareDataWith(*Out).Resize({n, d}); - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_2d, - &Out_2d); - } -}; - -template -class SoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = dX->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = dX->dims()[axis]; - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - if (dX->numel() == 0) { - return; - } - - const int n = SizeToAxis(axis, dX->dims()); - const int d = SizeFromAxis(axis, dX->dims()); - Tensor dX_2d, Out_2d, dOut_2d; - dX_2d.ShareDataWith(*dX).Resize({n, d}); - Out_2d.ShareDataWith(*Out).Resize({n, d}); - dOut_2d.ShareDataWith(*dOut).Resize({n, d}); - - math::SoftmaxGradFunctor()( - context.template device_context(), axis_dim, &Out_2d, - &dOut_2d, &dX_2d); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc index 07e74354bfd7ce930dc0a1b084e668fa2a0983cf..152c8d0a883b09358dc253d65523b30fb59a25b6 100644 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -12,8 +12,9 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -51,7 +52,7 @@ class SoftmaxGradNPUKernel : public framework::OpKernel { auto dims = dX->dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); int64_t first_dim = 1; int64_t sec_dim = 1; for (int i = 0; i < axis; i++) { diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index defda1a3b04a62254cc6ccbfe254f739cc31f909..3bc55fafd81e18d0a986268ff4692129c6515edc 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, NPU); template diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index a29804e505f66f8ee4bf4eb281886b45963f537c..1ed13c8bd1baea28301814d788af67954ee7932a 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -11,8 +11,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -29,7 +29,7 @@ class SoftmaxXPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Output("Out"); const int rank = x->dims().size(); - int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); // allocate memory on device. out->mutable_data(context.GetPlace()); @@ -88,7 +88,7 @@ class SoftmaxGradXPUKernel : public framework::OpKernel { auto* dout = context.Input(framework::GradVarName("Out")); auto* dx = context.Output(framework::GradVarName("X")); const int rank = dx->dims().size(); - int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); // allocate memory on device. 
dx->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index cba779d0a77d0037c596de1de3b486bf567c05f2..6f0881e9fc98f6c1ce6c7535c9c68a2fe64e2241 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -153,7 +153,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(Logits).")); - axis = CanonicalAxis(axis, logits_rank); + axis = phi::funcs::CanonicalAxis(axis, logits_rank); for (int i = 0; i < logits_rank; i++) { if (i != axis) { if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { @@ -250,7 +250,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(Logits).")); - axis = CanonicalAxis(axis, softmax_rank); + axis = phi::funcs::CanonicalAxis(axis, softmax_rank); for (int i = 0; i < softmax_rank; i++) { if (i != axis) { if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 2bbacef596e5916a7c82c29a83f85e9b5932b2d4..19a395e72314db52d52cf704a567dce8dd58318a 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -17,19 +17,22 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { +#define ALIGN_BYTES 16 + using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; +namespace kps = phi::kps; // Wrapper of log function. Use log(float32) for float16 template @@ -47,6 +50,18 @@ static __device__ __forceinline__ T Exp(T x) { return math::TolerableValue()(static_cast(expx)); } +template +struct ExpAddFunctor { + HOSTDEVICE inline ExpAddFunctor(Tx max) : max(max) {} + + HOSTDEVICE inline Ty operator()(const Tx& sum, const Tx& x) const { + return static_cast(sum + std::exp(x - max)); + } + + private: + Tx max; +}; + // log2(value) static inline int Log2Ceil(int value) { int log2_value = 0; @@ -236,7 +251,7 @@ __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, max_value[i] = (max_value[i] > valmax) ? 
max_value[i] : valmax; } } - WarpReduceMax(max_value); + phi::WarpReduceMax(max_value); // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } AccT sum[kBatchSize]; @@ -276,7 +291,7 @@ __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); // write data #pragma unroll @@ -419,10 +434,272 @@ void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, } } +template +__device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, + const int label_id, + const int64_t label_value, + const int tid, const int vec_size, + const int offset, + const int ignore_index) { + int loss_id = vec_size * tid + offset; + if (IgnoreIndex) { + if (label_value == loss_id) { + if (label_value == ignore_index) { + loss[label_id] = static_cast(0.0f); + } else { + loss[label_id] = loss_value; + } + } + } else { + if (label_value == loss_id) { + loss[label_id] = loss_value; + } + } +} + +template +__device__ __forceinline__ AccT ThreadReduce(const T* input, int size, + const int offset, AccT init, + ReduceFunctor reducer) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + AccT val = init; + + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + val = reducer(val, input[tid]); + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + val = reducer(val, ins[i]); + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + val = reducer(val, input[tid]); + } + return val; +} + +template +__device__ __forceinline__ void VectorizedSoftmaxForwardImpl( + T* loss, T* softmax, const T* logits, const LabelT* label, int size, + const int offset, const phi::LogSoftmaxForwardFunctor& func, + const int ignore_index) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + int label_id = blockIdx.x; + auto label_value = static_cast(label[label_id]); + const bool label_valid = label_value >= 0 && label_value < size; + int loss_id_offset = 0; + + if (offset > 0) { + logits -= offset; + softmax -= offset; + size += offset; + loss_id_offset -= offset; + if (tid >= offset) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, 1, + loss_id_offset, ignore_index); + } + } + size -= blockDim.x; + logits += blockDim.x; + softmax += blockDim.x; + loss_id_offset += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + T outs[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + VecT* outs_vec = reinterpret_cast(&outs); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + // read + *ins_vec = reinterpret_cast(logits)[tid]; + +#pragma unroll + // compute + for (int i = 0; i < VecSize; ++i) { + AccT log_softmax = func(static_cast(ins[i])); + outs[i] = static_cast(std::exp(log_softmax)); + + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, VecSize, + loss_id_offset + i, ignore_index); + } + } + + // write + reinterpret_cast(softmax)[tid] = *outs_vec; + } + + // scalar part + tid = size - remain + 
threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), label_id, + label_value, tid, 1, loss_id_offset, + ignore_index); + } + } + + // invalid label, write once + if (!label_valid && threadIdx.x == 0) { + loss[label_id] = static_cast(0.0f); + } +} + +template +__device__ __forceinline__ void ScalarSoftmaxForwardImpl( + T* loss, T* softmax, const T* logits, const LabelT* label, const int size, + const phi::LogSoftmaxForwardFunctor& func, const int ignore_index) { + int tid = threadIdx.x; + int remain = size % (VecSize * blockDim.x); + int label_id = blockIdx.x; + auto label_value = static_cast(label[label_id]); + const bool label_valid = label_value >= 0 && label_value < size; + + // main part + for (; tid < (size - remain); tid += VecSize * blockDim.x) { + T ins[VecSize]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + ins[i] = logits[tid + i * blockDim.x]; + } +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT log_softmax = func(static_cast(ins[i])); + softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, VecSize, i, + ignore_index); + } + } + } + + // tail part + for (; tid < size; tid += blockDim.x) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), label_id, + label_value, tid, 1, 0, ignore_index); + } + } + + // invalid label, write once + if (!label_valid && threadIdx.x == 0) { + loss[label_id] = static_cast(0.0f); + } +} + +template +__global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, + const LabelT* label, + const int high_dim, const int mid_dim, + const int ignore_index) { + using VecT = kps::details::VectorType; + + // each block deal with one batch + logits += blockIdx.x * mid_dim; + softmax += blockIdx.x * mid_dim; + + const int input_offset = ((uint64_t)logits) % ALIGN_BYTES / sizeof(T); + const int output_offset = ((uint64_t)softmax) % ALIGN_BYTES / sizeof(T); + + // 1. reduce max + AccT max = ThreadReduce>( + logits, mid_dim, input_offset, -std::numeric_limits::infinity(), + kps::MaxFunctor()); + max = kps::details::BlockXReduce>( + max, kps::MaxFunctor()); + + // 2. reduce sum + AccT sum = ThreadReduce>( + logits, mid_dim, input_offset, static_cast(0), + ExpAddFunctor(max)); + sum = kps::details::BlockXReduce>( + sum, kps::AddFunctor()); + + // 3. 
softmax + phi::LogSoftmaxForwardFunctor func(max, sum); + if (input_offset == output_offset) { + VectorizedSoftmaxForwardImpl( + loss, softmax, logits, label, mid_dim, input_offset, func, + ignore_index); + } else { + ScalarSoftmaxForwardImpl( + loss, softmax, logits, label, mid_dim, func, ignore_index); + } +} + +template +void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, + const LabelT* label, const int high_dim, + const int mid_dim, const int ignore_index, + gpuStream_t stream) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(mid_dim / vec_size, max_num_threads); + if (vec_size > 1) { + max_block_size /= 2; + } + + int block_size = 1; + while (block_size < max_block_size) { + block_size *= 2; + } + block_size = std::max(block_size, kps::details::kWarpSize); + dim3 grids(high_dim); + dim3 blocks(block_size); + VectorizedSoftmaxForward<<>>( + loss, softmax, logits, label, high_dim, mid_dim, ignore_index); +} + /* Wrapper of softmax with cross entropy hard label. - - SwitchWarpSoftmaxForward for small size - - cudnn function for large size + - SwitchWarpSoftmaxForward for small size when axis == -1 + - LaunchVectorizedSoftmaxForward for large size when axis == -1 + - cudnn function for axis != -1 */ template static void SoftmaxWithCrossEntropyHardLabel( @@ -431,11 +708,17 @@ static void SoftmaxWithCrossEntropyHardLabel( T* softmax_data, int N, int dim, int D, const int ignore_index) { auto stream = ctx.stream(); constexpr int max_dim = 320; - if (D == 1 && dim <= max_dim) { // small size - const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, dim, - ignore_index, stream); + if (D == 1) { + if (dim <= max_dim) { // small size + const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; + SwitchWarpSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, dim, + ignore_index, stream); + } else { // large size + LaunchVectorizedSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, + ignore_index, stream); + } } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; @@ -566,7 +849,7 @@ __global__ void CrossEntropySoftLabel(T* loss, T* softmaxwrt, const T* softmax, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); __syncthreads(); __shared__ T sumshare[kWarpPerBatch][kBatchPerBlock][kBatchSize]; @@ -674,7 +957,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, : static_cast(valmax); } } - WarpReduceMax(max_value); + phi::WarpReduceMax(max_value); // compute sum AccT sum[kBatchSize]{0.0}; @@ -694,7 +977,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); // log_softmax and loss AccT sumloss[kBatchSize]{0.0}; @@ -737,7 +1020,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, } // loss - WarpReduceSum(sumloss); + phi::WarpReduceSum(sumloss); for (int i = 0; i < kBatchSize; i++) { if (i >= local_batches) break; @@ -950,11 +1233,12 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = softmax->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = + phi::funcs::CanonicalAxis(context.Attr("axis"), rank); const int axis_dim = 
softmax->dims()[axis]; - const int n = SizeToAxis(axis, softmax->dims()); - const int d = SizeFromAxis(axis, softmax->dims()); + const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); auto* softmax_out_data = softmax_out->template mutable_data(context.GetPlace()); @@ -1035,11 +1319,11 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logits->dims()[axis]; - const int64_t n = SizeToAxis(axis, logits->dims()); - const int64_t d = SizeFromAxis(axis, logits->dims()); + const int64_t n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis, logits->dims()); auto* softmax_data = softmax->template mutable_data(context.GetPlace()); auto* loss_data = loss->template mutable_data(context.GetPlace()); @@ -1118,11 +1402,11 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logit_grad->dims()[axis]; - const int64_t n = SizeToAxis(axis, logit_grad->dims()); - const int64_t d = SizeFromAxis(axis, logit_grad->dims()); + const int64_t n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); const int64_t remain = d / axis_dim; #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index a7f88dd0ec38e55a7f1d0ea79436cdd376d14393..4b875cbf5841f661b55e668808051c8928b45cdd 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -84,7 +84,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { Tensor* softmax_out = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); const int rank = softmax->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = + phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = softmax->dims()[axis]; PADDLE_ENFORCE_GT( @@ -97,7 +98,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax_out->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, softmax->dims()); + const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( @@ -105,7 +106,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { "SizeToAxis of softmax is %d.", n)); - const int d = SizeFromAxis(axis, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); Tensor softmax_2d, labels_2d, loss_2d, softmax_out_2d; softmax_2d.ShareDataWith(*softmax).Resize({n, d}); @@ -133,7 +134,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logits->dims()[axis]; PADDLE_ENFORCE_GT( axis_dim, 0, @@ -145,14 +146,14 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, logits->dims()); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( "The size of axis should be larger than 0, but received " "SizeToAxis of logits is %d.", n)); - const int d = SizeFromAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); Tensor logits_2d, softmax_2d, labels_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({n, d}); softmax_2d.ShareDataWith(*softmax).Resize({n, d}); @@ -192,7 +193,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logit_grad->dims()[axis]; PADDLE_ENFORCE_GT( axis_dim, 0, @@ -201,14 +202,14 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { "axis dimention is %d.", axis_dim)); - const int n = SizeToAxis(axis, logit_grad->dims()); + const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( "The size of axis should be larger than 0, but received " "SizeToAxis of logit_grad is %d.", n)); - const int d = SizeFromAxis(axis, logit_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); Tensor logit_grad_2d, labels_2d, out_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); labels_2d.ShareDataWith(labels).Resize({n, 
labels.numel() / n}); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index a51f68530caf88a8f5abe2b4615180266f409a8c..1cd6f8b7698b949a8e198c766fcf193e13481298 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "If soft_label=False, axis must be -1 or" " can be regard as last dimention in mlu kernel.")); - framework::Tensor labels_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor labels_int32(framework::TransToPhiDataType(VT::INT32)); labels_int32.Resize(labels->dims()); labels_int32.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index a5576ab5af3fdeb38b6c1fe87aff32dab412c0be..1f1fbea090c13f2eff7e389c9b7c4774ccbb7700 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + #include #include #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { @@ -40,15 +41,16 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { "the npu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - const int n = SizeToAxis(axis, logits->dims()); - const int d = SizeFromAxis(axis, logits->dims()); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); PADDLE_ENFORCE_EQ( labels->numel(), n, platform::errors::Unimplemented( - "The size of labels should be equal to SizeToAxis of logits," - "but got size of labels is %d and SizeToAxis is %d.", + "The size of labels should be equal to phi::funcs::SizeToAxis of " + "logits," + "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", labels->numel(), n)); loss->mutable_data(ctx.GetPlace()); @@ -97,9 +99,9 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { logits_grad->mutable_data(ctx.GetPlace()); const int rank = logits_grad->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - const int n = SizeToAxis(axis, logits_grad->dims()); - const int d = SizeFromAxis(axis, logits_grad->dims()); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); Tensor logits_grad_2d, loss_grad_1d, backprop_2d; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index 650e488c5e10b8153e4399b0c9eb4fe38a05b215..d9149b85c6a0f15a27dccf3564e50838e34b00c8 100644 --- 
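// --- Editor's illustrative sketch (not part of the diff) --------------------
// The axis helpers switched to phi::funcs:: throughout the hunks above
// (CanonicalAxis, SizeToAxis, SizeFromAxis) behave like the definitions
// deleted from softmax_op.h: wrap a negative axis by the rank, then split the
// element count into the product of dimensions before / from the axis.
// A standalone restatement of that behavior, with sketch names, for reference:
#include <cstdint>
#include <vector>

int CanonicalAxisSketch(int axis, int rank) {
  return axis < 0 ? axis + rank : axis;
}

int64_t SizeToAxisSketch(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (int i = 0; i < axis; ++i) size *= dims[i];
  return size;  // "n": number of independent softmax rows
}

int64_t SizeFromAxisSketch(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) size *= dims[i];
  return size;  // "d": length of each row including trailing dims
}
// -----------------------------------------------------------------------------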
a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -38,13 +38,13 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { Tensor* softmax = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( "axis should == rank - 1")); softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, logits->dims()); - const int d = SizeFromAxis(axis, logits->dims()); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); std::vector logits_dims = phi::vectorize(logits->dims()); const bool soft_label = context.Attr("soft_label"); @@ -122,11 +122,11 @@ class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( "axis should == rank - 1")); - const int n = SizeToAxis(axis, logit_grad->dims()); - const int d = SizeFromAxis(axis, logit_grad->dims()); + const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index fe76448a185c956d8c08c600f0e0f887e2d057b8..db3dc214bfe7ae7ae7facc59deca71ce9dfe91f6 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -25,9 +25,10 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" #if defined(PADDLE_WITH_ONEMKL) -#include "paddle/fluid/platform/dynload/mklrt.h" +#include "paddle/phi/backends/dynload/mklrt.h" #elif defined(PADDLE_WITH_POCKETFFT) #include "extern_pocketfft/pocketfft_hdronly.h" #endif @@ -357,12 +358,12 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { // FFT Functors #if defined(PADDLE_WITH_ONEMKL) -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!platform::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW(platform::errors::External( \ - platform::dynload::DftiErrorMessage(status))); \ +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ } while (0); namespace { @@ -370,7 +371,7 @@ namespace { struct DftiDescriptorDeleter { void operator()(DFTI_DESCRIPTOR_HANDLE handle) { if (handle != nullptr) { - MKL_DFTI_CHECK(platform::dynload::DftiFreeDescriptor(&handle)); + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); } } }; @@ -385,7 +386,7 @@ class DftiDescriptor { "DftiDescriptor has already been initialized.")); DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(platform::dynload::DftiCreateDescriptorX( + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( 
&raw_desc, precision, signal_type, signal_ndim, sizes)); desc_.reset(raw_desc); } @@ -437,21 +438,21 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); // placement inplace or not inplace - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( - descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, + DFTI_NOT_INPLACE)); // number of transformations const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); // input & output distance const MKL_LONG idist = in_strides[0]; const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - DFTI_OUTPUT_DISTANCE, odist)); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); // input & output stride std::vector mkl_in_stride(1 + signal_ndim, 0); @@ -460,14 +461,14 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, mkl_in_stride[i] = in_strides[i]; mkl_out_stride[i] = out_strides[i]; } - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); // conjugate even storage if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } @@ -489,12 +490,12 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_BACKWARD_SCALE; } }(); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - scale_direction, scale)); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); } // commit the descriptor - MKL_DFTI_CHECK(platform::dynload::DftiCommitDescriptor(descriptor.get())); + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); return descriptor; } @@ -575,39 +576,39 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, framework::TransToProtoVarType(out->dtype()), input_stride, output_stride, signal_sizes, normalization, forward); - const FFTTransformType fft_type = GetFFTTransformType(x->type(), out->type()); + const FFTTransformType fft_type = + GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->type())); if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj( - framework::TransToProtoVarType(collapsed_input.dtype())); + framework::Tensor collapsed_input_conj(collapsed_input.dtype()); collapsed_input_conj.mutable_data(collapsed_input.dims(), ctx.GetPlace()); // conjugate the input platform::ForRange for_range(ctx, collapsed_input.numel()); - math::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); + phi::funcs::ConjFunctor functor(collapsed_input.data(), + collapsed_input.numel(), + 
collapsed_input_conj.data()); for_range(functor); - MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( desc.get(), collapsed_input_conj.data(), collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj( - framework::TransToProtoVarType(collapsed_output.dtype())); + framework::Tensor collapsed_output_conj(collapsed_output.dtype()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); - MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output_conj.data())); // conjugate the output platform::ForRange for_range(ctx, collapsed_output.numel()); - math::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); + phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), + collapsed_output.numel(), + collapsed_output.data()); for_range(functor); } else { if (forward) { - MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output.data())); } else { - MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( desc.get(), collapsed_input.data(), collapsed_output.data())); } } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index f04ba72a1e181654466acac52ffe58cd74cdc2da..a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ USE_OP(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(softmax); +USE_OP_ITSELF(softmax); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc index a9f835f6fe2c25d6ffdfae93e1c7cd170db6b891..102902bdaaaaf4a6a94699f561a5e91213be8c44 100644 --- a/paddle/fluid/operators/top_k_op_mlu.cc +++ b/paddle/fluid/operators/top_k_op_mlu.cc @@ -47,7 +47,7 @@ class TopkMLUKernel : public framework::OpKernel { const bool sorted = true; const int axis = -1; // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 7bada0179a1c5e73669b07fd77171f764db6e21c..5b8a6b3e75449508afa5d316d81f97ab815c9ea9 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -55,7 +55,7 @@ class TopkV2MLUKernel : public framework::OpKernel { indices->mutable_data(place); // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index f980e007271e3cfd8cf80c2f69ee32cde12aff0f..6eb7f922dfdbec41aa1c47d11e1decc259d08689 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -23,28 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); - TruncatedNormal truncated_normal(mean, std); - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = truncated_normal(dist(*engine)); - } - } -}; - class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -124,5 +102,3 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, ops::TruncatedGaussianRandomOp, ops::TruncatedGaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(truncated_gaussian_random, - ops::CPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu deleted file mode 100644 index 5e530a5bb5248e79d6ba19b23f86788a2eb3315f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" - -namespace paddle { -namespace operators { - -template -struct GPUTruncatedNormal { - T mean, std; - T a_normal_cdf; - T b_normal_cdf; - unsigned int seed; - T numeric_min; - - __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) - : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; - } - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); - rng.discard(n); - T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; - } -}; - -template -struct TruncatedNormalOffset { - T mean, std; - T a_normal_cdf; - T b_normal_cdf; - unsigned int seed; - T numeric_min; - int offset_; - - __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min, - int seed, int offset) - : mean(mean), - std(std), - seed(seed), - numeric_min(numeric_min), - offset_(offset) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; - } - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); - rng.discard(n + offset_); - T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; - } -}; - -template -class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormalOffset(mean, std, std::numeric_limits::min(), - seed_offset.first, gen_offset)); - } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - 
thrust::device_ptr(data), - GPUTruncatedNormal( - mean, std, std::numeric_limits::min(), seed)); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - truncated_gaussian_random, - paddle::operators::GPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index 1c7b9a27f868821ceb20c720548b4df0ee6bcd40..b8d8467b7eba9f360d8b2043bd4ed3f63e42725a 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -36,7 +36,7 @@ class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { ctx.template device_context(); float value = static_cast(0.0f); phi::FullKernel( - static_cast::TYPE&>(dev_cxt), dims, value, phi::DataType::UNDEFINED, dx); } diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 37709c953e13b07a9cead3684275a521333fa92a..04c8a329e5e1a3cc7177a09d592d46ba3ac700ec 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -28,7 +28,7 @@ cc_library(denormal SRCS denormal.cc DEPS) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost flags pten_enforce) +set(enforce_deps flags errors boost flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -52,7 +52,7 @@ ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -cc_library(place SRCS place.cc DEPS enforce boost pten_place) +cc_library(place SRCS place.cc DEPS enforce boost phi_place) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) IF(WITH_MKLDNN) @@ -122,7 +122,7 @@ cc_library(init SRCS init.cc DEPS device_context custom_kernel) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) if(WITH_XPU) target_link_libraries(device_context xpu_context) @@ -138,7 +138,7 @@ if(WITH_CNCL) endif() if(WITH_GPU OR WITH_ROCM) - target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) + target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() if (WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 00f0cc2ac92bf7edac1766358f14651844570cd9..f7c13ec7ed5edc034813360322967b6cb4643087 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,12 +1,12 @@ IF(WITH_GPU) add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) ELSEIF(WITH_ROCM) add_subdirectory(rocm) 
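The truncated_gaussian_random CPU and CUDA kernels deleted above (presumably superseded by migrated phi kernels, since the op registration itself stays) both sample by inverse transform: a uniform value is mapped into [Phi(-2), Phi(2)] and pushed through the inverse normal CDF, so every draw lands within two standard deviations of the mean. A self-contained sketch of that transform in plain C++, with erfinv approximated by Newton iterations on std::erf because the standard library has no inverse error function; this illustrates the math only and is not the replacement kernel:

#include <cmath>
#include <cstdio>
#include <limits>
#include <random>

// Invert erf(x) = y by Newton's method; d/dx erf(x) = 2/sqrt(pi) * exp(-x*x).
double ErfInv(double y) {
  const double kSqrtPi = std::sqrt(3.14159265358979323846);
  double x = 0.0;
  for (int i = 0; i < 50; ++i) {
    x -= (std::erf(x) - y) * kSqrtPi * std::exp(x * x) / 2.0;
  }
  return x;
}

// Draw from normal(mean, std) truncated to [mean - 2*std, mean + 2*std],
// given a uniform sample in (0, 1).
double TruncatedNormal(double mean, double std_dev, double uniform01) {
  const double a_cdf = (1.0 + std::erf(-2.0 / std::sqrt(2.0))) / 2.0;  // Phi(-2)
  const double b_cdf = (1.0 + std::erf(2.0 / std::sqrt(2.0))) / 2.0;   // Phi(2)
  const double p = a_cdf + (b_cdf - a_cdf) * uniform01;  // uniform over (Phi(-2), Phi(2))
  return std::sqrt(2.0) * ErfInv(2.0 * p - 1.0) * std_dev + mean;  // Phi^-1(p) * std + mean
}

int main() {
  std::mt19937_64 engine(42);
  std::uniform_real_distribution<double> dist(std::numeric_limits<double>::min(), 1.0);
  for (int i = 0; i < 5; ++i) {
    std::printf("%f\n", TruncatedNormal(0.0, 1.0, dist(engine)));
  }
  return 0;
}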
- hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) + hip_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 8f7fd3dcbc03a2a780c70ce4e8599db864747da1..85050038d5a8363b005ed2397c9f6c3c03f18b62 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) -nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda phi) diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 8616e969f69dfd469fec0372d40f6365e5038425..8aec8e840f33273a3130355c751e635e4a3f6736 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -210,6 +210,12 @@ template ::value>::type * = nullptr> __device__ __forceinline__ void VectorizedAtomicAddPerBlock( const int64_t len, int tid, int threads_per_block, const T *in, T *out) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +#else int i = 0; int loops = len / 2 * 2; @@ -233,6 +239,7 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( fastAtomicAdd(out, i, len, in[i]); } } +#endif } #endif #endif diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 91ab7f3f4f052707ce7ae57147169889cdc4c259..c124d58957fe642365bd5bbf074bc15bfd74c6ba 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -113,7 +113,7 @@ void Executor::Run(const std::vector &inputs, auto fetch_dtype = fetch_info.dataType(); auto paddle_type = PopartType2VarType(fetch_dtype); tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(paddle_type)); + framework::TransToPhiDataType(paddle_type)); anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); } diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 90c0851d79d8079d35c4bf035f130c9c86089c7e..d45492391dc88ce0c690e0768e080dd989a0539c 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -467,7 +467,7 @@ void NpuOpRunner::TypeAdapter( } else { tmp_inputs[i].Resize(inputs[i].dims()); tmp_inputs[i].mutable_data(dev_ctx.GetPlace(), - framework::TransToPtenDataType(input_type[i])); + framework::TransToPhiDataType(input_type[i])); const auto &cast_runner = NpuOpRunner( "Cast", {inputs[i]}, {tmp_inputs[i]}, @@ -484,7 +484,7 @@ void NpuOpRunner::TypeAdapter( } else { tmp_outputs[i].Resize(outputs[i].dims()); tmp_outputs[i].mutable_data( - dev_ctx.GetPlace(), framework::TransToPtenDataType(output_type[i])); + dev_ctx.GetPlace(), framework::TransToPhiDataType(output_type[i])); } } diff --git 
a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 28573eb0c1e4ce2a8e6d7a2ba2d61edb6941ce51..b6a26f2554a131aab6e87146c241dc973d9c8f56 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -5,7 +5,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index aa020593454f8f74659ac1a6ba1e5205b2075ec6..f79ef8505d878b28125aaf84574942fb1698de8b 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -27,7 +27,10 @@ using XPUKernelSet = using XPUOpMap = std::unordered_map; XPUOpMap& get_kp_ops() { - static XPUOpMap s_xpu_kp_kernels{}; + static XPUOpMap s_xpu_kp_kernels{ + {"elementwise_add", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + }; return s_xpu_kp_kernels; } diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 49391a65b185b45b35edac5d6217a2e4095b4c4a..87aa5dcde626bafd5e605cc9e35de7cf1b589569 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce phi_dynamic_loader) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) + hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader phi_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) elseif (WITH_ASCEND_CL) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc) else() - nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) + nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader phi_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) endif() if (WITH_MKLML) - cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml phi_dynload_mklml) endif() -cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack) +cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader phi_dynload_lapack) add_dependencies(dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? 
if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt) + cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader phi_dynload_mklrt) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index dacfe2bd2e7f584847abc4c39114061073770e88..854e5a7b9f04a63e43e4e910c26d4e592651c125 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -50,7 +51,8 @@ namespace dynload { __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h index 3b7d23277e065d007d2049d7a63ed3af7e1fdbdb..334b98a1c3d5ab9442dfd2ca6b7c5d7055e94559 100644 --- a/paddle/fluid/platform/dynload/mklrt.h +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -17,7 +17,8 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/mklrt.h" #include "paddle/phi/backends/dynload/port.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index bc29a0472041afbbff84fa346f4dd0f1535925b6..c2d7eef58236952501020d49695356a1a952bc20 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -37,7 +37,7 @@ void* GetTensorRtPluginHandle(); extern std::once_flag tensorrt_plugin_dso_flag; extern void* tensorrt_plugin_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_(__name) \ struct DynLoad__##__name { \ template \ void* operator()(Args... args) { \ @@ -55,7 +55,7 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ @@ -72,7 +72,7 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ @@ -109,10 +109,10 @@ extern void* tensorrt_plugin_dso_handle; #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ __macro(initLibNvInferPlugins); -TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_) TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( - DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) -TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_) +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_) #endif // end of NV_TENSORRT_MAJOR diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index f3f7064efeeb2e1121c09a29473a4a81a063f849..abc427a3ca8815ecf193e4f9213223aa79069ea5 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,136 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" + #include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace paddle { namespace platform { template -struct ForRange { - ForRange(const DeviceContext& dev_ctx, size_t limit); - - template - void operator()(Function func) const; -}; - -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} - - template - void operator()(Function func) const { - for (size_t i = 0; i < limit_; ++i) { - func(i); - } - } - - size_t limit_; -}; - -template <> -struct ForRange { - ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {} - - template - void operator()(Function func) const { - for (size_t i = 0; i < limit_; ++i) { - func(i); - } - } - - size_t limit_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) -template -__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { - size_t idx = static_cast(threadIdx.x); - func(idx); -} - -template -__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx < limit) { - func(idx); - } -} - -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} - - template - inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON - // JETSON_NANO will throw core dump when threads > 128 - int num_thread = 256; - platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); - const int num_threads = num_thread; -#else - constexpr int num_threads = 1024; -#endif - size_t block_size = limit_ <= num_threads ? 
limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } - } - - const CUDADeviceContext& dev_ctx_; - size_t limit_; -}; - -template <> -struct ForRange { - ForRange(const phi::GPUContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} - - template - inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON - // JETSON_NANO will throw core dump when threads > 128 - int num_thread = 256; - platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); - const int num_threads = num_thread; -#else - constexpr int num_threads = 1024; -#endif - size_t block_size = limit_ <= num_threads ? limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } - } - - const phi::GPUContext& dev_ctx_; - size_t limit_; -}; - -#endif +using ForRange = phi::funcs::ForRange; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 285c6a4c130530987f3f63b1eecdf2ed1593ef09..01de7349f4823a66b2d180f3d1493477f361273a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1056,7 +1056,7 @@ class ReorderMKLDNNHandler { platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); auto dst_data = output->mutable_data( - place, framework::TransToPtenDataType(vtype_dst_), dst_md.get_size()); + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); return std::make_shared(dst_md, engine_, dst_data); } @@ -1065,7 +1065,7 @@ class ReorderMKLDNNHandler { const MKLDNNMemoryFormat& fmt, platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); auto dst_data = output->mutable_data( - place, framework::TransToPtenDataType(vtype_dst_), dst_md.get_size()); + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); return std::make_shared(dst_md, engine_, dst_data); } diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 58d37783d059709417707a767f306c20d2c65b67..36dd7891d5518681140a86215cb6f0792ee1bdd7 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -95,8 +95,6 @@ std::unordered_map GetAllThreadIds() { return res; } -static constexpr const char* kDefaultThreadName = "unset"; - std::string GetCurrentThreadName() { const auto& thread_name = internal::ThreadDataRegistry::GetInstance() @@ -112,7 +110,7 @@ std::unordered_map GetAllThreadNames() { bool SetCurrentThreadName(const std::string& name) { auto& instance = internal::ThreadDataRegistry::GetInstance(); const auto& cur_name = instance.GetCurrentThreadData(); - if (!cur_name.empty() || cur_name == kDefaultThreadName) { + if (!cur_name.empty() || name.empty() || name == kDefaultThreadName) { return false; } instance.SetCurrentThreadData(name); diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index 7f607aaec9763dfe3d76998517b2114218de5e5f..ef894fd3dc28174e01412cfbda83e58482f6ab6d 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -57,7 
+57,8 @@ ThreadId GetCurrentThreadId(); // create/destory when using it. std::unordered_map GetAllThreadIds(); -// Returns 'unset' if SetCurrentThreadName is never called. +static constexpr const char* kDefaultThreadName = "unset"; +// Returns kDefaultThreadName if SetCurrentThreadName is never called. std::string GetCurrentThreadName(); // Return the map from StdTid to ThreadName diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt old mode 100644 new mode 100755 index 320e989bd9bb1881e7f1ad0d6d5506fb6e313e24..5acdfa39569f037fb0db5fbb0037f6ce42d2bac0 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,8 +1,11 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer) cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) +cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) add_subdirectory(dump) +cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc new file mode 100644 index 0000000000000000000000000000000000000000..672a9a154535a1cb76a4bbc2bde074b6eecefd9e --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
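The os_info change above tightens SetCurrentThreadName into a set-once operation that also rejects an empty string and the reserved default name "unset" (kDefaultThreadName, now declared in the header); together with the collector change further down, only threads that were explicitly named ever reach AddThreadName. A minimal standalone sketch of that guard using thread_local state; the real implementation goes through a ThreadDataRegistry, so this only illustrates the accepted and rejected cases:

#include <cstdio>
#include <string>

static constexpr const char* kDefaultThreadName = "unset";

// A thread may be named exactly once; empty names and the reserved
// default name are rejected.
bool SetCurrentThreadNameOnce(const std::string& name) {
  thread_local std::string current_name;  // empty until the first successful call
  if (!current_name.empty() || name.empty() || name == kDefaultThreadName) {
    return false;
  }
  current_name = name;
  return true;
}

int main() {
  std::printf("%d\n", SetCurrentThreadNameOnce("unset"));   // 0: reserved default
  std::printf("%d\n", SetCurrentThreadNameOnce(""));        // 0: empty name
  std::printf("%d\n", SetCurrentThreadNameOnce("worker"));  // 1: first real name wins
  std::printf("%d\n", SetCurrentThreadNameOnce("other"));   // 0: already named
  return 0;
}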
+ +#include "paddle/fluid/platform/profiler/cpu_utilization.h" + +namespace paddle { +namespace platform { + +#ifdef _MSC_VER +static uint64_t FileTimeToUint64(FILETIME time) { + uint64_t low_part = time.dwLowDateTime; + uint64_t high_part = time.dwHighDateTime; + uint64_t result = (high_part << 32) | low_part; + return result; +} +#endif + +void CpuUtilization::RecordBeginTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&start_); + GetSystemTimes(&system_idle_time_start_, &system_kernel_time_start_, + &system_user_time_start_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_start_, &process_user_time_start_); + +#elif defined(__linux__) + start_ = times(&process_tms_start_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, + &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +void CpuUtilization::RecordEndTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&end_); + GetSystemTimes(&system_idle_time_end_, &system_kernel_time_end_, + &system_user_time_end_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_end_, &process_user_time_end_); +#elif defined(__linux__) + end_ = times(&process_tms_end_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +float CpuUtilization::GetCpuUtilization() { + float cpu_utilization = 0.0; +#if defined(_MSC_VER) + uint64_t system_user_time_start = FileTimeToUint64(system_user_time_start_); + uint64_t system_user_time_end = FileTimeToUint64(system_user_time_end_); + uint64_t system_kernel_time_start = + FileTimeToUint64(system_kernel_time_start_); + uint64_t system_kernel_time_end = FileTimeToUint64(system_kernel_time_end_); + uint64_t system_idle_time_start = FileTimeToUint64(system_idle_time_start_); + uint64_t system_idle_time_end = FileTimeToUint64(system_idle_time_end_); + float busy_time = (system_kernel_time_end - system_kernel_time_start) + + (system_user_time_end - system_user_time_start); + float idle_time = system_idle_time_end - system_idle_time_start; + cpu_utilization = busy_time / (busy_time + idle_time); + +#elif defined(__linux__) + float busy_time = (system_tms_end_.tms_utime - 
system_tms_start_.tms_utime) + + (system_tms_end_.tms_stime - system_tms_start_.tms_stime) + + (nice_time_end_ - nice_time_start_) + + (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) + + (steal_end_ - steal_start_); + float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_); + cpu_utilization = busy_time / (busy_time + idle_time); +#else + LOG(WARNING) + << "Current System is not supported to get system cpu utilization" + << cpu_utilization << std::endl; +#endif + return cpu_utilization; +} + +float CpuUtilization::GetCpuCurProcessUtilization() { + float cpu_process_utilization = 0.0; +#ifdef _MSC_VER + uint64_t process_user_time_start = FileTimeToUint64(process_user_time_start_); + uint64_t process_user_time_end = FileTimeToUint64(process_user_time_end_); + uint64_t process_kernel_time_start = + FileTimeToUint64(process_kernel_time_start_); + uint64_t process_kernel_time_end = FileTimeToUint64(process_kernel_time_end_); + uint64_t start = FileTimeToUint64(start_); + uint64_t end = FileTimeToUint64(end_); + float busy_time = (process_kernel_time_end - process_kernel_time_start) + + (process_user_time_end - process_user_time_start); + cpu_process_utilization = busy_time / (end - start); + LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl; +#elif defined(__linux__) + float busy_time = + (process_tms_end_.tms_utime - process_tms_start_.tms_utime) + + (process_tms_end_.tms_stime - process_tms_start_.tms_stime); + cpu_process_utilization = busy_time / (end_ - start_); +#else + LOG(WARNING) + << "Current System is not supported to get process cpu utilization" + << cpu_process_utilization << std::endl; +#endif + return cpu_process_utilization; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h new file mode 100644 index 0000000000000000000000000000000000000000..7b05a6302cdb0628ad526b3fc6ae18fcb8df619b --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
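GetCpuUtilization above reduces two snapshots of the aggregate "cpu" line in /proc/stat to a single ratio: busy time is the delta of user + nice + system + irq + softirq + steal, idle time is the delta of idle + iowait, and utilization is busy / (busy + idle). A standalone Linux-only sketch of the same arithmetic; field order follows proc(5), and this is an illustration rather than the profiler's implementation:

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <thread>

struct CpuSample {
  uint64_t user = 0, nice = 0, system = 0, idle = 0;
  uint64_t iowait = 0, irq = 0, softirq = 0, steal = 0;
};

// Parse the aggregate "cpu" line of /proc/stat.
CpuSample ReadCpuSample() {
  CpuSample s;
  std::ifstream stat("/proc/stat");
  std::string label;
  stat >> label >> s.user >> s.nice >> s.system >> s.idle >> s.iowait >>
      s.irq >> s.softirq >> s.steal;
  return s;
}

int main() {
  CpuSample a = ReadCpuSample();
  std::this_thread::sleep_for(std::chrono::seconds(1));
  CpuSample b = ReadCpuSample();
  double busy = (b.user - a.user) + (b.nice - a.nice) + (b.system - a.system) +
                (b.irq - a.irq) + (b.softirq - a.softirq) + (b.steal - a.steal);
  double idle = (b.idle - a.idle) + (b.iowait - a.iowait);
  std::printf("system cpu utilization: %.2f%%\n", 100.0 * busy / (busy + idle));
  return 0;
}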
+ +#pragma once + +#include +#include +#include +#include "glog/logging.h" +#ifdef _MSC_VER +#include +#else +#include +#include +#endif + +namespace paddle { +namespace platform { + +class CpuUtilization { + public: + CpuUtilization() {} + void RecordBeginTimeInfo(); + void RecordEndTimeInfo(); + float GetCpuUtilization(); + float GetCpuCurProcessUtilization(); + + private: +#ifdef _MSC_VER + FILETIME start_, end_; + FILETIME process_user_time_start_, process_user_time_end_; + FILETIME process_kernel_time_start_, process_kernel_time_end_; + FILETIME system_user_time_start_, system_user_time_end_; + FILETIME system_kernel_time_start_, system_kernel_time_end_; + FILETIME system_idle_time_start_, system_idle_time_end_; + FILETIME process_creation_time_, process_exit_time_; +#else + clock_t start_, end_; + uint64_t idle_start_, idle_end_; + uint64_t iowait_start_, iowait_end_; + uint64_t nice_time_start_, nice_time_end_; + uint64_t irq_start_, irq_end_; + uint64_t softirq_start_, softirq_end_; + uint64_t steal_start_, steal_end_; + struct tms system_tms_start_, system_tms_end_; + struct tms process_tms_start_, process_tms_end_; +#endif +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/extra_info.h b/paddle/fluid/platform/profiler/extra_info.h new file mode 100644 index 0000000000000000000000000000000000000000..04532592ebd30793d7707e03b96c07c8e4dc4b1e --- /dev/null +++ b/paddle/fluid/platform/profiler/extra_info.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +class ExtraInfo { + public: + ExtraInfo() {} + template + void AddExtraInfo(const std::string& key, const std::string& format, + Args... args); + void Clear() { extra_info_.clear(); } + std::unordered_map GetExtraInfo() { + return extra_info_; + } + + private: + std::unordered_map extra_info_; +}; + +template +void ExtraInfo::AddExtraInfo(const std::string& key, const std::string& format, + Args... args) { + std::string value = string_format(format, args...); + extra_info_[key] = value; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index 2172fe4d1e3d5786492ea8741b5e50146648e59d..afd1c2b3012d46100dbead81792108cffb52e9a3 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -14,9 +14,16 @@ #include "paddle/fluid/platform/profiler/host_tracer.h" #include "glog/logging.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" +// Used to filter events, works like glog VLOG(level). +// RecordEvent will works if host_trace_level >= level. 
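The flag defined just below is a threshold that works like glog's VLOG level: a host event recorded at level N is kept only when host_trace_level >= N, so raising the flag admits finer-grained events. A minimal sketch of that filter; the default of 2 mirrors the flag definition, everything else is illustrative:

#include <cstdint>
#include <cstdio>

// Threshold consulted by host-side RecordEvent calls.
static int64_t g_host_trace_level = 2;  // default of FLAGS_host_trace_level

// An event submitted at `level` is recorded only when the configured
// trace level is at least that high.
bool ShouldRecordHostEvent(uint32_t level) {
  return static_cast<int64_t>(level) <= g_host_trace_level;
}

int main() {
  std::printf("level 1 -> %d\n", ShouldRecordHostEvent(1));  // 1: recorded
  std::printf("level 9 -> %d\n", ShouldRecordHostEvent(9));  // 0: filtered out
  g_host_trace_level = 10;  // e.g. --host_trace_level=10
  std::printf("level 9 -> %d\n", ShouldRecordHostEvent(9));  // 1: recorded now
  return 0;
}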
+PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, + "RecordEvent will works " + "if host_trace_level >= level."); + namespace paddle { namespace platform { @@ -26,6 +33,9 @@ void ProcessHostEvents(const HostEventSection& host_events, TraceEventCollector* collector) { for (const auto& thr_sec : host_events.thr_sections) { uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } for (const auto& evt : thr_sec.events) { HostTraceEvent event; event.name = evt.name; @@ -41,12 +51,18 @@ void ProcessHostEvents(const HostEventSection& host_events, } // namespace +void HostTracer::PrepareTracing() { + // warm up + HostTraceLevel::GetInstance().SetLevel(options_.trace_level); + state_ = TracerState::READY; +} + void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( state_ == TracerState::READY || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("TracerState must be READY")); HostEventRecorder::GetInstance().GatherEvents(); - HostTraceLevel::GetInstance().SetLevel(trace_level_); + HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } diff --git a/paddle/fluid/platform/profiler/host_tracer.h b/paddle/fluid/platform/profiler/host_tracer.h index b6c10e558b787cd84e760fb892bd75ebace90c3c..d05e829357f884b45cffae8cee7f7b627a366359 100644 --- a/paddle/fluid/platform/profiler/host_tracer.h +++ b/paddle/fluid/platform/profiler/host_tracer.h @@ -45,9 +45,9 @@ struct HostTracerOptions { class HostTracer : public TracerBase { public: - explicit HostTracer(const HostTracerOptions& options) { - trace_level_ = options.trace_level; - } + explicit HostTracer(const HostTracerOptions& options) : options_(options) {} + + void PrepareTracing() override; void StartTracing() override; @@ -56,7 +56,7 @@ class HostTracer : public TracerBase { void CollectTraceData(TraceEventCollector* collector) override; private: - uint32_t trace_level_; + HostTracerOptions options_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h index ff4effad5ecc414e70b99b0cd996c5ea402c7e3a..05a68cf2a4a8debf482cd9226f1226d3679f62a1 100644 --- a/paddle/fluid/platform/profiler/output_logger.h +++ b/paddle/fluid/platform/profiler/output_logger.h @@ -33,7 +33,6 @@ class BaseLogger { virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogNodeTrees(const NodeTrees&) {} - virtual void LogMetaInfo() {} }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index de5a0cc9be4ede29ac70409edaac5541c53c5c96..4fc1c6daf96c7f30cbd549b23b1a8f23563bc590 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -23,11 +23,13 @@ #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/tracer_base.h" +DECLARE_int64(host_trace_level); + namespace paddle { namespace platform { struct ProfilerOptions { - uint32_t trace_level = 0; + uint32_t trace_level = FLAGS_host_trace_level; }; class Profiler { diff --git a/paddle/fluid/platform/profiler/test_extra_info.cc b/paddle/fluid/platform/profiler/test_extra_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..7274c9de977e982fa0c00015cafe07ff1d0a9d54 --- /dev/null +++ b/paddle/fluid/platform/profiler/test_extra_info.cc @@ 
-0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +using paddle::platform::ExtraInfo; + +TEST(ExtraInfoTest, case0) { + ExtraInfo instance; + instance.AddExtraInfo(std::string("info1"), std::string("%d"), 20); + instance.AddExtraInfo(std::string("info2"), std::string("%s"), "helloworld"); + std::unordered_map map = instance.GetExtraInfo(); + EXPECT_EQ(map["info1"], "20"); + EXPECT_EQ(map["info2"], "helloworld"); + EXPECT_EQ(map.size(), 2u); + instance.Clear(); + map = instance.GetExtraInfo(); + EXPECT_EQ(map.size(), 0u); +} diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index 30b32220d9f845e5c03e058b72224194bf769b76..cc85a178d14e57c1e1523e794f0016afb5714299 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { @@ -32,6 +34,10 @@ class TraceEventCollector { device_events_.push_back(event); } + void AddThreadName(uint64_t tid, const std::string& name) { + thread_names_[tid] = name; + } + const std::list& HostEvents() const { return host_events_; } const std::list& RuntimeEvents() const { @@ -42,7 +48,12 @@ class TraceEventCollector { return device_events_; } + const std::unordered_map& ThreadNames() const { + return thread_names_; + } + private: + std::unordered_map thread_names_; std::list host_events_; std::list runtime_events_; std::list device_events_; diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..b43389866c7a8150846bef874f49bd72907f446f --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/profiler/utils.h" + +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cupti.h" + +namespace paddle { +namespace platform { +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, + int32_t StaticSharedMemory, + int32_t DynamicSharedMemory, int32_t BlockX, + int32_t BlockY, int32_t BlockZ, float BlocksPerSm) { + float occupancy = 0.0; + std::vector device_ids = GetSelectedDevices(); + if (DeviceId < device_ids.size()) { + const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId); + cudaOccFuncAttributes occFuncAttr; + occFuncAttr.maxThreadsPerBlock = INT_MAX; + occFuncAttr.numRegs = RegistersPerThread; + occFuncAttr.sharedSizeBytes = StaticSharedMemory; + occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; + occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + occFuncAttr.maxDynamicSharedSizeBytes = 0; + const cudaOccDeviceState occDeviceState = {}; + int blockSize = BlockX * BlockY * BlockZ; + size_t dynamicSmemSize = DynamicSharedMemory; + cudaOccResult occ_result; + cudaOccDeviceProp prop(device_property); + cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( + &occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize, + dynamicSmemSize); + if (status == CUDA_OCC_SUCCESS) { + if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) { + BlocksPerSm = occ_result.activeBlocksPerMultiprocessor; + } + occupancy = + BlocksPerSm * blockSize / + static_cast(device_property.maxThreadsPerMultiProcessor); + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } + return occupancy; +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 04014b972c3e3599beef0a60635fa122a153233f..cd56d343842686abc31343effc93cf1a4887411c 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -42,5 +45,11 @@ static std::string GetStringFormatLocalTime() { static int64_t nsToUs(int64_t ns) { return ns / 1000; } +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, int32_t blockX, + int32_t blockY, int32_t blockZ, float blocksPerSm); +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 49690d1c66be74090c684d09f50e6c0d7b67d787..6f714a677033bb87d1a221f62baffa1112726571 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -59,7 +59,7 @@ struct Transform { BinaryOperation op); }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. +// NOTE: After the phi kernel is migrated, it needs to be deleted. 
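CalculateEstOccupancy above ends in a simple ratio of threads: the CUDA occupancy calculator returns the maximum number of blocks the kernel can keep resident on one SM, that count is clamped against the blocks-per-SM figure the tracer reported, and the estimate is clamped_blocks * threads_per_block / maxThreadsPerMultiProcessor. A standalone sketch of just that final arithmetic, with the device limit passed in as a plain number; illustrative only, no CUDA headers involved:

#include <algorithm>
#include <cstdio>

// Estimated occupancy: fraction of an SM's thread slots covered by the
// blocks this kernel actually keeps resident.
float EstimateOccupancy(int active_blocks_per_sm,  // from the occupancy calculator
                        float blocks_per_sm,       // blocks per SM seen by the tracer
                        int block_x, int block_y, int block_z,
                        int max_threads_per_sm) {  // device limit, e.g. 2048
  float blocks =
      std::min(static_cast<float>(active_blocks_per_sm), blocks_per_sm);
  int threads_per_block = block_x * block_y * block_z;
  return blocks * threads_per_block / static_cast<float>(max_threads_per_sm);
}

int main() {
  // 4 resident blocks of 256 threads on an SM with 2048 thread slots -> 0.50
  std::printf("%.2f\n", EstimateOccupancy(4, 8.0f, 256, 1, 1, 2048));
  return 0;
}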
template <> struct Transform { template diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e76183192bcee517279afe7ba5832af3b2e3d84b..1f06eda8a2ee5dc8322b5e16e1f7eb2e0703f9a8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils tcp_store) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -299,7 +299,7 @@ if(WITH_PYTHON) if(NOT ON_INFER) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api autograd_meta backward grad_node_info pten op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index d9a2dcb6869096a5f08675bb6dc7994cc8c9889b..1052f93d32ec3cb626577c4b584cc6172c83da2e 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -75,7 +75,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, std::shared_ptr dense_tensor = std::make_shared( phi::make_intrusive(place), - phi::DenseTensorMeta(paddle::framework::TransToPtenDataType(dtype), + phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype), ddims)); if (phi::product(ddims) > 0) { dense_tensor->mutable_data(place); @@ -133,7 +133,7 @@ void InitTensorWithTensor(TensorObject* self, VLOG(4) << "Same place, do ShareDataWith"; } else { self->tensor.set_impl( - src.copy_to(phi::TransToPtenBackend(place), true).impl()); + src.copy_to(phi::TransToPhiBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } if (src.get_autograd_meta()) { @@ -157,7 +157,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self, auto temp = paddle::experimental::Tensor(std::make_shared(src)); self->tensor.set_impl( - temp.copy_to(phi::TransToPtenBackend(place), true).impl()); + temp.copy_to(phi::TransToPhiBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index b825e9265a8cd8b080df7fd316b33007c2445384..0b04dc7347ce78f87d6f8d81e30eb4135fd965ed 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -135,7 +135,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - dst = 
src.copy_to(phi::TransToPtenBackend(place), blocking); + dst = src.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( egr::EagerUtils::autograd_meta(&(src))->StopGradient()); egr::EagerUtils::autograd_meta(&dst)->SetPersistable( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4e900ae2ffbc11c4c0859ff65cf2b21048b3a649..f11a2ab2517fb481f184c9b68b2558c999d88ec9 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -20,6 +20,8 @@ limitations under the License. */ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -35,6 +37,82 @@ limitations under the License. */ namespace paddle { namespace pybind { +namespace py = ::pybind11; + +class PyTensorHook : public egr::TensorHook { + public: + explicit PyTensorHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorHook for var " << var.name(); + + PyObject* res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, ToPyObject(var), nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) { + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "Hook function of Tensor return a nullptr.")); + if (res == Py_None) { + return var; + } + return reinterpret_cast(res)->tensor; + } + + private: + PyObject* py_func_; +}; + +class PyTensorVoidHook : public egr::TensorVoidHook { + public: + explicit PyTensorVoidHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorVoidHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + void operator()() override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorVoidHook"; + + try { + PyObject_CallFunctionObjArgs(py_func_, nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) 
{ + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + } + + private: + PyObject* py_func_; +}; + extern void InitTensorWithNumpyValue(TensorObject* self, const pybind11::object& array, bool zero_copy); @@ -113,7 +191,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = - self->tensor.copy_to(phi::TransToPtenBackend(place), blocking); + self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( @@ -403,6 +481,92 @@ static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + int64_t hook_id; + if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + hook_id = accumulation_grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + + } else { + VLOG(6) << "Register hook for non leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + hook_id = grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + } + return ToPyObject(hook_id); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_remove_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(6) << "Remove the registered hook for tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + + int64_t hook_id = pybind::CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0); + + return ToPyObject(grad_node->RemoveGradientHook(hook_id)); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "Register reduce hook for tensor: " << self->tensor.name(); + + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE_EQ(egr::egr_utils_api::IsLeafTensor(self->tensor), true, + platform::errors::InvalidArgument( + "Only can register backward hook for leaf Tensor.")); + PADDLE_ENFORCE_EQ( + !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(), + true, platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient.")); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + PyObject* 
hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(hook_func)); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -440,6 +604,14 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_grad_hook", + (PyCFunction)(void (*)(void))tensor_register_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_remove_grad_hook", (PyCFunction)(void (*)(void))tensor_remove_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_backward_hook", + (PyCFunction)(void (*)(void))tensor_register_reduce_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 4fe47d5a8427d11f560e73990ea8bad7bae7a929..c15c171799f4421fc3e8b40a84abdbb062709dc7 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -32,7 +32,7 @@ #endif #include "paddle/fluid/pybind/op_function_generator.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" // clang-format off @@ -365,9 +365,9 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip ooerator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. - // if the pten lib contains op kernel, we still generate ops method + // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } std::string func_name = "eager_api_" + op_type; diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 14e4fac7cdd95ac3b33d64741c4b2f461a7225be..8283a249ded4c0c790add73573621252bc8954d8 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -15,7 +15,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index cbbe56985b2adaab0a4a33214132066332cdcd79..9d5bcfac494cba0c550cf7f2751f485b689473b9 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif -// pten +// phi #include "paddle/phi/kernels/declarations.h" // NOTE(pangyoki): Inplace OP with duplicable input. @@ -400,9 +400,9 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip operator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. 
- // if the pten lib contains op kernel, we still generate ops method + // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 629dc2c4037e73b3dfd76126e14bb34c985e38ce..e8c338b3fd18801356e2f5474dc80e0150c40dce 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -44,8 +44,6 @@ void BindPSGPUWrapper(py::module* m) { .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) - .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer, - py::call_guard()) .def("set_date", &framework::PSGPUWrapper::SetDate, py::call_guard()) .def("set_dataset", &framework::PSGPUWrapper::SetDataset, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 958174420570e29aa4d48bc145353f4883f83d1a..6e553ad2e60e292881fa8bb0294ea2a247656b67 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -50,8 +50,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" @@ -464,7 +464,7 @@ static void inline CreateVariableIfNotExit( tensor_temp->Resize(phi::make_ddim(var_desc.GetShape())); tensor_temp->mutable_data( exe->GetPlace(), - framework::TransToPtenDataType(var_desc.GetDataType())); + framework::TransToPhiDataType(var_desc.GetDataType())); } } } else { @@ -671,60 +671,60 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_get_use_default_grad_op_desc_maker_ops", [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - m.def( - "_get_all_register_op_kernels", - [](const std::string &lib) { - std::unordered_map> - all_kernels_info; - if (lib == "fluid" || lib == "all") { - auto &all_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - - for (auto &kernel_pair : all_kernels) { - auto op_type = kernel_pair.first; - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - paddle::framework::OpKernelType kernel_type = info_pair.first; - kernel_types.emplace_back( - paddle::framework::KernelTypeToString(kernel_type)); + m.def("_get_all_register_op_kernels", + [](const std::string &lib) { + std::unordered_map> + all_kernels_info; + if (lib == "fluid" || lib == "all") { + auto &all_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto &kernel_pair : all_kernels) { + auto op_type = kernel_pair.first; + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + paddle::framework::OpKernelType kernel_type = info_pair.first; + kernel_types.emplace_back( + paddle::framework::KernelTypeToString(kernel_type)); + } + all_kernels_info.emplace(op_type, kernel_types); } - all_kernels_info.emplace(op_type, kernel_types); } - } - if (lib == "pten" || lib == "all") { - auto pten_kernels = phi::KernelFactory::Instance().kernels(); - for (auto &kernel_pair : pten_kernels) { - auto op_type = 
phi::TransToFluidOpName(kernel_pair.first); - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - framework::OpKernelType kernel_type = - framework::TransPtenKernelKeyToOpKernelType(info_pair.first); - auto kernel_type_str = framework::KernelTypeToString(kernel_type); - if (all_kernels_info.count(op_type)) { - if (std::find(all_kernels_info[op_type].begin(), - all_kernels_info[op_type].end(), - kernel_type_str) == - all_kernels_info[op_type].end()) { - all_kernels_info[op_type].emplace_back(kernel_type_str); + if (lib == "phi" || lib == "all") { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto &kernel_pair : phi_kernels) { + auto op_type = phi::TransToFluidOpName(kernel_pair.first); + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + framework::OpKernelType kernel_type = + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); + auto kernel_type_str = + framework::KernelTypeToString(kernel_type); + if (all_kernels_info.count(op_type)) { + if (std::find(all_kernels_info[op_type].begin(), + all_kernels_info[op_type].end(), + kernel_type_str) == + all_kernels_info[op_type].end()) { + all_kernels_info[op_type].emplace_back(kernel_type_str); + } + } else { + kernel_types.emplace_back(kernel_type_str); } - } else { - kernel_types.emplace_back(kernel_type_str); } - } - if (!kernel_types.empty()) { - all_kernels_info.emplace(op_type, kernel_types); + if (!kernel_types.empty()) { + all_kernels_info.emplace(op_type, kernel_types); + } } } - } - return all_kernels_info; - }, - py::arg("lib") = "all", - R"DOC( + return all_kernels_info; + }, + py::arg("lib") = "all", + R"DOC( Return the registered kernels in paddle. Args: - lib[string]: the libarary, could be 'pten', 'fluid' and 'all'. + lib[string]: the libarary, could be 'phi', 'fluid' and 'all'. 
)DOC"); // NOTE(zjl): ctest would load environment variables at the beginning even @@ -823,39 +823,39 @@ PYBIND11_MODULE(core_noavx, m) { .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::MLUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_clear", &framework::Tensor::clear) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::NPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) @@ -3786,86 +3786,142 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_IPU py::class_>(m, "IpuBackend") - .def(py::init(&platform::ipu::IpuBackend::GetNewInstance)) - .def("clear", &platform::ipu::IpuBackend::Clear) + std::unique_ptr>( + m, "IpuBackend") + // manage IpuBackend in C++ + .def("get_instance", + []() { + return std::unique_ptr( + platform::ipu::IpuBackend::GetInstance()); + }, + py::return_value_policy::reference) + .def("detach", &platform::ipu::IpuBackend::Detach) + .def("reset", &platform::ipu::IpuBackend::Reset) .def("set_scope", &platform::ipu::IpuBackend::SetScope) - .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy); - - py::class_ ipu_strategy(m, "IpuStrategy"); - ipu_strategy.def(py::init()) - .def_property( - "num_ipus", - [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; }, - [](platform::ipu::IpuStrategy &self, int num_ipus) { - self.num_ipus = num_ipus; - }) - .def_property( - "accumulationFactor", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.accumulationFactor; - }, - [](platform::ipu::IpuStrategy &self, int accumulationFactor) { - self.popart_options_.accumulationFactor = accumulationFactor; - }) - .def_property("batches_per_step", - [](const platform::ipu::IpuStrategy &self) { - return self.batches_per_step; - }, - [](platform::ipu::IpuStrategy &self, int batches_per_step) { - self.batches_per_step = batches_per_step; - }) - .def_property("is_training", - [](const platform::ipu::IpuStrategy &self) { - return self.is_training; - }, - [](platform::ipu::IpuStrategy &self, bool is_training) { - self.is_training = is_training; - }) - .def_property( - "enable_pipelining", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.enablePipelining; - }, - [](platform::ipu::IpuStrategy &self, bool enable_pipelining) { - self.popart_options_.enablePipelining = enable_pipelining; - }) - .def_property( - "enable_manual_shard", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.virtualGraphMode == - platform::ipu::VirtualGraphMode::Manual; - }, - [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) { - if (enable_ipu_shard) { - self.popart_options_.virtualGraphMode = - platform::ipu::VirtualGraphMode::Manual; - } else { - self.popart_options_.virtualGraphMode = - platform::ipu::VirtualGraphMode::Off; - } - }) - .def_property("need_avg_shard", - [](const platform::ipu::IpuStrategy &self) { - return self.need_avg_shard; - }, - [](platform::ipu::IpuStrategy &self, bool need_avg_shard) { - self.need_avg_shard = need_avg_shard; - }) - .def_property("batch_size", - [](const platform::ipu::IpuStrategy &self) { - return self.batch_size; - }, - [](platform::ipu::IpuStrategy &self, int batch_size) { - self.batch_size = batch_size; - }) - .def_property("enable_fp16", - [](const platform::ipu::IpuStrategy &self) { - return self.enable_fp16; - }, - [](platform::ipu::IpuStrategy &self, bool enable_fp16) { - self.enable_fp16 = enable_fp16; - }); + .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy) + .def("save_model_proto", &platform::ipu::IpuBackend::SaveModelProto); + + py::class_(m, "IpuStrategy") + .def(py::init()) + .def("set_options", + [](platform::ipu::IpuStrategy &self, const py::dict &opt) { + for (auto element : opt) { + auto option_name = element.first.cast(); + VLOG(10) << "Set option: " << option_name; + if (py::isinstance(element.second)) { + self.AddBoolOption(option_name, element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddDoubleOption(option_name, + 
element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddUint64Option(option_name, + element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddStringOption(option_name, + element.second.cast()); + } else if (py::isinstance(element.second) || + py::isinstance(element.second)) { + for (auto option : element.second.cast()) { + std::string option_val; + if (py::isinstance(option)) { + option_val = option.cast(); + } else if (py::isinstance(option)) { + option_val = std::to_string(option.cast()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Failed to convert type: %s when set IpuStrategy " + "option: %s", + option.get_type(), option_name)); + } + self.InsertStringOption(option_name, option_val); + } + } else if (py::isinstance(element.second)) { + if (option_name.rfind("location_", 0) == 0) { + for (auto option : element.second.cast()) { + self.SetTensorLocation( + option_name, option.first.cast(), + option.second.cast()); + } + } else if (option_name == "custom_op") { + std::string paddle_op; + std::string popart_op; + std::string domain; + int version = -1; + for (auto option : element.second.cast()) { + std::string option_key = option.first.cast(); + if (option_key == "paddle_op") { + paddle_op = option.second.cast(); + } else if (option_key == "popart_op") { + popart_op = option.second.cast(); + } else if (option_key == "domain") { + domain = option.second.cast(); + } else if (option_key == "version") { + version = option.second.cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid argument, key must be one of paddle_op, " + "popart_op, domain or version, but revecived %s", + option_key)); + } + } + self.AddCustomOp(paddle_op, popart_op, domain, version); + } else { + for (auto option : element.second.cast()) { + std::string option_key = option.first.cast(); + std::string option_val; + if (py::isinstance(option.second)) { + option_val = option.second.cast(); + } else if (py::isinstance(option.second)) { + option_val = + std::to_string(option.second.cast()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Failed to convert value type: %s when set " + "IpuStrategy option: %s", + option.second.get_type(), option_key)); + } + self.InsertStringPairOption(option_name, option_key, + option_val); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid IpuStrategy option value type: %s, please check " + "input value for option: %s", + element.second.get_type(), option_name)); + } + } + }) + .def("get_option", + [](platform::ipu::IpuStrategy &self, const std::string &name) { + py::dict res; + auto option_type = self.GetOptionType(name); + res["name"] = name; + res["type"] = option_type; + if (option_type == "vector") { + auto value = self.GetVectorOption(name); + res["value"] = value; + } else if (option_type == "map") { + auto value = self.GetMapOption(name); + res["value"] = value; + } else { + auto value_s = self.GetOption(name); + res["value_s"] = value_s; + if (option_type == "bool") { + res["value"] = static_cast(std::stoi(value_s)); + } else if (option_type == "uint64") { + res["value"] = std::stoul(value_s); + } else if (option_type == "double") { + res["value"] = std::stod(value_s); + } else if (option_type == "string") { + res["value"] = value_s; + } + } + return res; + }) + .def("enable_pattern", &platform::ipu::IpuStrategy::EnablePattern) + .def("disable_pattern", &platform::ipu::IpuStrategy::DisablePattern) + .def("is_pattern_enabled", 
&platform::ipu::IpuStrategy::IsPatternEnabled); #endif BindFleetWrapper(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 531cc03f26714a6041c1e2c205640b9ea06c440c..e7abd64ec4439611c307440597c7278cabb03ab9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -324,7 +324,7 @@ void SetTensorFromPyArrayT( if (zero_copy) { auto holder = std::make_shared>(array); auto type = framework::ToDataType(std::type_index(typeid(T))); - self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); + self->ResetHolderWithType(holder, framework::TransToPhiDataType(type)); } else { auto dst = self->mutable_data(place); std::memcpy(dst, array.data(), array.nbytes()); @@ -348,10 +348,16 @@ void SetTensorFromPyArrayT( if (zero_copy) { auto holder = std::make_shared>(array); auto type = framework::ToDataType(std::type_index(typeid(T))); - self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); + self->ResetHolderWithType(holder, framework::TransToPhiDataType(type)); } else { - auto dst = self->mutable_data(place); - std::memcpy(dst, array.data(), array.nbytes()); + // IPU does not store Tensor data, Tensor will be created on CPU + if (!self->initialized()) { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } else { + auto dst = self->mutable_data(self->place()); + std::memcpy(dst, array.data(), array.nbytes()); + } } #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -512,7 +518,7 @@ void SetUVATensorFromPyArray( cuda_device_pointer, need_allocate_size, platform::CUDAPlace(device_id)); self_tensor->ResetHolderWithType(holder, - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); #endif } diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 0f6dfb9d8f44e8be8fd41405ce635dff85ab2044..f2768f3dfa88d3405008baa7662f5e209ca3954c 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -74,6 +74,7 @@ endif() add_subdirectory(api) +add_subdirectory(backends) add_subdirectory(common) add_subdirectory(dialect) add_subdirectory(host_context) @@ -96,9 +97,11 @@ set(infrt_mlir_incs pd_extra_ops_inc rewrite_inc trt_ops_inc + pd_lower_to_trt_inc ) + if (INFRT_WITH_PHI) - set(phi_libs pten) + set(phi_libs phi) set(infrt_mlir_incs ${infrt_mlir_incs} MLIRinfrt_phi_tensorIncGen MLIRinfrt_phi_baseIncGen diff --git a/paddle/infrt/backends/CMakeLists.txt b/paddle/infrt/backends/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b639f892925685bd61f05a0a5db0f8af0f44070a --- /dev/null +++ b/paddle/infrt/backends/CMakeLists.txt @@ -0,0 +1,3 @@ +if (INFRT_WITH_PHI AND WITH_GPU AND WITH_TENSORRT) + add_subdirectory(tensorrt) +endif() diff --git a/paddle/infrt/backends/tensorrt/CMakeLists.txt b/paddle/infrt/backends/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc20c9a2e14b639fbc02b74ab5870188d7e55d63 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(infrt_trt SRCS trt_engine.cc DEPS glog phi_dynload_cuda phi) + +cc_test_tiny(test_infrt_trt SRCS test_trt_engine.cc DEPS infrt_trt phi_dynload_cuda tensorrt_converter) diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..54b7bc3e8af835077fcd2ac00d33b15e4ae3f95c --- /dev/null +++ 
b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +const char* model_input = "model_input"; +const char* model_output = "model_output1"; +const char* model_output2 = "model_output2"; + +TrtUniquePtr ConstructNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + IActivationLayer* act = + network->addActivation(*data, ActivationType::kSIGMOID); + CHECK_NOTNULL(act); + auto* act_out = act->getOutput(0); + std::vector output_length{1, 2}; + int axis; + nvinfer1::IPluginV2Layer* split_layer; + if (is_static_shape) { + axis = 0; + paddle::inference::tensorrt::plugin::SplitPlugin plugin( + axis, output_length, false); + split_layer = network->addPluginV2(&act_out, 1, plugin); + } else { + axis = 1; + paddle::inference::tensorrt::plugin::SplitPluginDynamic plugin( + axis, output_length, false); + split_layer = network->addPluginV2(&act_out, 1, plugin); + } + + split_layer->getOutput(0)->setName(model_output); + split_layer->getOutput(1)->setName(model_output2); + network->markOutput(*split_layer->getOutput(0)); + network->markOutput(*split_layer->getOutput(1)); + return network; +} + +// sigmoid(x) = 1 / (1 + exp(-x)) +inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } + +TEST(trt, run_static) { + TRTEngine static_trt_engine(0); + auto net = ConstructNetwork( + static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions static_build_options; + static_build_options.max_batch = 4; + static_trt_engine.Build(std::move(net), static_build_options); + InferenceOptions inference_options; + inference_options.batch = 2; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + 
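+  // The GPU context is brought up in two phases: a partial init without an
+  // allocator, then the stream-bound allocator from AllocatorFacade, then the
+  // remaining init that needs the allocator (PartialInitWithAllocator below).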
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + phi::DenseTensor output, output2; + std::unordered_map outputs; + outputs.emplace(std::make_pair(model_output, &output)); + outputs.emplace(std::make_pair(model_output2, &output2)); + + static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.GetEngineInfo(); + static_trt_engine.Run(context); + + std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); + std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); + paddle::memory::Copy(phi::CPUPlace(), + output_data1.data(), + place, + output.data(), + sizeof(float) * output_data1.size(), + context.stream()); + paddle::memory::Copy(phi::CPUPlace(), + output_data2.data(), + place, + output2.data(), + sizeof(float) * output_data2.size(), + context.stream()); + cudaStreamSynchronize(context.stream()); + + for (size_t i = 0; i < host_data.size(); ++i) { + int w = i % 28; + int h = (i / 28) % 28; + int c = i / (28 * 28) % 3; + int n = i / (28 * 28 * 3); + if (c == 0) { + CHECK_NEAR( + sigmoid(host_data[i]), output_data1[n * 28 * 28 + h * 28 + w], 1e-5); + } else { + CHECK_NEAR(sigmoid(host_data[i]), + output_data2[n * 28 * 28 * 2 + (c - 1) * 28 * 28 + h * 28 + w], + 1e-5); + } + } +} + +TEST(trt, run_dynamic) { + TRTEngine engine(0); + auto net = ConstructNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 32; + // build_options.fp16 = true; + std::vector min_shape{1, 3, 16, 16}; + std::vector opt_shape{2, 3, 28, 28}; + std::vector max_shape{4, 3, 28, 28}; + build_options.shapes[model_input][0] = min_shape; + build_options.shapes[model_input][1] = opt_shape; + build_options.shapes[model_input][2] = max_shape; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 2; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 16, 16})); + phi::DenseTensor input, output, output2; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 16 * 16, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + std::unordered_map outputs; + inputs.emplace(std::make_pair(model_input, &input)); + 
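+  // Tensors are bound by the names given to the network definition above
+  // (model_input / model_output / model_output2); SetUpInference maps each
+  // name to its TensorRT binding index.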
outputs.emplace(std::make_pair(model_output, &output)); + outputs.emplace(std::make_pair(model_output2, &output2)); + + engine.SetUpInference(inference_options, inputs, &outputs); + engine.GetEngineInfo(); + engine.Run(context); + + std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); + std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); + paddle::memory::Copy(phi::CPUPlace(), + output_data1.data(), + place, + output.data(), + sizeof(float) * output_data1.size(), + context.stream()); + paddle::memory::Copy(phi::CPUPlace(), + output_data2.data(), + place, + output2.data(), + sizeof(float) * output_data2.size(), + context.stream()); + cudaStreamSynchronize(context.stream()); + + for (size_t i = 0; i < host_data.size(); ++i) { + int w = i % 16; + int h = (i / 16) % 16; + int c = i / (16 * 16) % 3; + int n = i / (16 * 16 * 3); + if (c == 0) { + CHECK_NEAR( + sigmoid(host_data[i]), output_data1[n * 16 * 16 + h * 16 + w], 1e-5); + } else { + CHECK_NEAR(sigmoid(host_data[i]), + output_data2[n * 16 * 16 * 2 + (c - 1) * 16 * 16 + h * 16 + w], + 1e-5); + } + } +} + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..a204fe42b45080b0ba5526473622f34e4fe4ef41 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" + +#include +#include +#include "glog/logging.h" +#include "paddle/phi/backends/dynload/tensorrt.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/ddim.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +// The following two API are implemented in TensorRT's header file, cannot load +// from the dynamic library. So create our own implementation and directly +// trigger the method from the dynamic library. 
+static nvinfer1::IBuilder* createInferBuilder( + nvinfer1::ILogger& logger) { // NOLINT + return static_cast( + phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} +static nvinfer1::IRuntime* createInferRuntime( + nvinfer1::ILogger& logger) { // NOLINT + return static_cast( + phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} + +TRTEngine::TRTEngine(int device_id) : device_id_(device_id) { + FreshDeviceId(); + logger_.reset(new TrtLogger()); + builder_.reset(createInferBuilder(logger_->GetTrtLogger())); + phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); +} + +nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() { + CHECK_NOTNULL(builder_); + return builder_.get(); +} + +void TRTEngine::Build(TrtUniquePtr network, + const BuildOptions& build_options) { + FreshDeviceId(); + ModelToBuildEnv(std::move(network), build_options); + CHECK_NOTNULL(engine_); +} + +bool TRTEngine::ModelToBuildEnv( + TrtUniquePtr network, + const BuildOptions& build) { + CHECK_NOTNULL(builder_); + std::swap(network, network_); + CHECK_NOTNULL(network_); + // ModelToNetwork(network_, logger); + NetworkToEngine(build); + return true; +} + +bool TRTEngine::NetworkToEngine(const BuildOptions& build) { + TrtUniquePtr config{builder_->createBuilderConfig()}; + CHECK_NOTNULL(config); + CHECK(SetupNetworkAndConfig(build, *network_, *config)); + +#if IS_TRT_VERSION_LT(8000) + engine_.reset(builder_->buildEngineWithConfig(*network_, *config)); +#else + serialized_engine_.reset( + builder_->buildSerializedNetwork(*network_, *config)); + CHECK_NOTNULL(serialized_engine_); + + TrtUniquePtr runtime{createInferRuntime(logger_->GetTrtLogger())}; + CHECK_NOTNULL(runtime); + engine_.reset(runtime->deserializeCudaEngine(serialized_engine_->data(), + serialized_engine_->size())); + CHECK_NOTNULL(engine_); +#endif + return true; +} + +bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, + INetworkDefinition& network, + IBuilderConfig& config) { + builder_->setMaxBatchSize(build.max_batch); + // TODO(wilber): handle one engine - multi execution context case. + IOptimizationProfile* profile{nullptr}; + if (!build.shapes.empty()) { + profile = builder_->createOptimizationProfile(); + CHECK_NOTNULL(profile); + } + + // Set formats and data types of inputs + for (int32_t i = 0; i < network.getNbInputs(); ++i) { + auto* input = network.getInput(i); + if (!build.input_formats.empty()) { + input->setType(build.input_formats[i].first); + input->setAllowedFormats(build.input_formats[i].second); + } else { + switch (input->getType()) { + case DataType::kINT32: + case DataType::kBOOL: + case DataType::kHALF: + // Leave these as is. + break; + case DataType::kFLOAT: + case DataType::kINT8: + // User did not specify a floating-point format. Default to kFLOAT. + input->setType(DataType::kFLOAT); + break; + } + input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + + if (profile) { + Dims dims = input->getDimensions(); + // TODO(wilber): shape tensor. + const bool is_dynamic_input = std::any_of( + dims.d, dims.d + dims.nbDims, [](int dim) { return dim == -1; }); + if (is_dynamic_input) { + is_dynamic_shape_ = true; + auto shape = build.shapes.find(input->getName()); + + // If no shape is provided + if (shape == build.shapes.end()) { + // TODO(wilber): add infomation. 
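+          // A dynamic input (a -1 in its dimensions) must come with
+          // min/opt/max shapes in BuildOptions::shapes; without a complete
+          // optimization profile the engine cannot be built, so fail fast.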
+ CHECK(false); + } + LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; + std::vector profile_dims{}; + profile_dims = + shape->second[static_cast(OptProfileSelector::kMIN)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kMIN, + VecToDims(profile_dims))); + profile_dims = + shape->second[static_cast(OptProfileSelector::kOPT)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kOPT, + VecToDims(profile_dims))); + profile_dims = + shape->second[static_cast(OptProfileSelector::kMAX)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kMAX, + VecToDims(profile_dims))); + } + } + } + + if (profile && is_dynamic_shape_) { + CHECK(profile->isValid()); // Required optimization profile is invalid + CHECK_NE(config.addOptimizationProfile(profile), -1); + } + + // Set formats and data types of outputs + for (int32_t i = 0, n = network.getNbOutputs(); i < n; i++) { + auto* output = network.getOutput(i); + if (!build.output_formats.empty()) { + // int outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.output_formats[i].first); + output->setAllowedFormats(build.output_formats[i].second); + } else { + output->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + } + + config.setMaxWorkspaceSize(static_cast(build.workspace) << 20); + + if (build.fp16) { + config.setFlag(BuilderFlag::kFP16); + bool support_fp16 = builder_->platformHasFastFp16(); + if (support_fp16) { + LOG(INFO) << "Run INFRT-TRT FP16 mode"; + } else { + LOG(INFO) << "You specify FP16 mode, but the hardware do not support " + "FP16 speed up, use FP32 instead."; + } + } + + if (build.tf32) { + config.setFlag(BuilderFlag::kTF32); + bool support_tf32 = builder_->platformHasTf32(); + if (support_tf32) { + LOG(INFO) << "Run INFRT-TRT TF32 mode"; + } else { + LOG(INFO) << "You specify TF32 mode, but the hardware do not support " + "TF32 speed up, use FP32 instead."; + } + } + + // TODO(wilber): other precision. + + // TODO(wilber): precision config. + switch (build.precision_constraints) { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. + break; + case PrecisionConstraints::kOBEY: + config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); + break; + case PrecisionConstraints::kPREFER: + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + break; + } + + // TODO(TRT): DLA config. + + // TODO(TRT): int8 config. + // TODO(TRT): support int8 + if (build.int8) { + assert(false); + config.setFlag(BuilderFlag::kINT8); + bool support_int8 = builder_->platformHasFastInt8(); + if (support_int8) { + LOG(INFO) << "Run INFRT-TRT FP16 mode"; + } + } + + // TODO(TRT): calib config. + + // TODO(TRT): sparse config. 
+ + return true; +} + +bool TRTEngine::SetUpInference( + const InferenceOptions& inference, + const std::unordered_map& inputs, + std::unordered_map* outputs) { + // TODO(wilber): now only create one exec_context + FreshDeviceId(); + CHECK(engine_ != nullptr); + nvinfer1::IExecutionContext* ec = engine_->createExecutionContext(); + CHECK(ec != nullptr); + contexts_.emplace_back(ec); + bindings_.emplace_back(new Bindings()); + + for (const auto& it : inputs) { + const int bind_index = engine_->getBindingIndex(it.first.c_str()); + bindings_.front()->AddBinding( + bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); + } + for (auto& it : *outputs) { + const int bind_index = engine_->getBindingIndex(it.first.c_str()); + bindings_.front()->AddBinding( + bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + } + + return true; +} + +void TRTEngine::Run(const phi::GPUContext& ctx) { + if (is_dynamic_shape_) { + DynamicRun(ctx); + } else { + StaticRun(ctx); + } +} + +void TRTEngine::StaticRun(const phi::GPUContext& ctx) { + const int num_bindings = engine_->getNbBindings(); + std::vector buffers(num_bindings, nullptr); + + int runtime_batch = -1; + auto input_binds = bindings_.front()->GetInputBindings(); + for (auto bind : input_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + buffers[bind_index] = + const_cast(static_cast(bind.buffer->data())); + if (runtime_batch != -1) { + CHECK_EQ(runtime_batch, phi::vectorize(bind.buffer->dims())[0]); + } + runtime_batch = bind.buffer->dims()[0]; + } + + auto output_binds = bindings_.front()->GetOutputBindings(); + for (auto bind : output_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + std::vector ddim; + auto dims = engine_->getBindingDimensions(bind_index); + ddim.push_back(runtime_batch); + for (int i = 0; i < dims.nbDims; ++i) { + ddim.push_back(dims.d[i]); + } + bind.buffer->Resize(phi::make_ddim(ddim)); + ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); + buffers[bind_index] = static_cast(bind.buffer->data()); + } + + contexts_.front()->enqueue( + runtime_batch, buffers.data(), ctx.stream(), nullptr); +} + +void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { + const int num_bindings = engine_->getNbBindings(); + std::vector buffers(num_bindings, nullptr); + + auto input_binds = bindings_.front()->GetInputBindings(); + for (auto bind : input_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + buffers[bind_index] = + const_cast(static_cast(bind.buffer->data())); + nvinfer1::Dims trt_dims; + trt_dims.nbDims = bind.buffer->dims().size(); + + for (int i = 0; i < trt_dims.nbDims; ++i) { + trt_dims.d[i] = bind.buffer->dims()[i]; + } + contexts_.front()->setBindingDimensions(bind_index, trt_dims); + } + + CHECK(contexts_.front()->allInputDimensionsSpecified()); + + auto output_binds = bindings_.front()->GetOutputBindings(); + for (auto bind : output_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + auto dims = contexts_.front()->getBindingDimensions(bind_index); + std::vector ddim(dims.nbDims); + for (int i = 0; i < dims.nbDims; ++i) { + ddim[i] = dims.d[i]; + } + bind.buffer->Resize(phi::make_ddim(ddim)); + ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); + buffers[bind_index] = static_cast(bind.buffer->data()); + } + + contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr); +} + +void TRTEngine::FreshDeviceId() { + int count; + cudaGetDeviceCount(&count); + 
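+  // Validate the requested device id before making it the active CUDA device.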
CHECK_LT(device_id_, count); + phi::backends::gpu::SetDeviceId(device_id_); +} + +void TRTEngine::GetEngineInfo() { +#if IS_TRT_VERSION_GE(8200) + LOG(INFO) << "====== engine info ======"; + std::unique_ptr infer_inspector( + engine_->createEngineInspector()); + infer_inspector->setExecutionContext(contexts_.front().get()); + LOG(INFO) << infer_inspector->getEngineInformation( + nvinfer1::LayerInformationFormat::kONELINE); + LOG(INFO) << "====== engine info end ======"; +#else + LOG(INFO) << "Inspector needs TensorRT version 8.2 and after."; +#endif +} + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..f72bdaf3ac0b463d086e9aeda62823cc725f2db9 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/backends/tensorrt/trt_utils.h" +#include "paddle/phi/backends/dynload/tensorrt.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { +using namespace nvinfer1; // NOLINT + +// The trt programing model as follows: +// 1. The build phase: +// IBuilder* builder = createInferBuilder(&logger_); +// 2. Create a network definition: +// INetworkDefinition* network = builder->createNetworkV2(...); +// 3. Build network: +// network->AddLayer(...) +// 4. Configure network: +// IBuilderConfig* config = builder->createBuilderConfig(); +// config->setMaxWorkspaceSize(...) +// 5. Get cuda engine and deserializing a plan: +// IHostMemory* serialized_model = builder->buildSerializedNetwork(...); +// IRuntime* runtime = createInferRuntime(&logger_); +// ICudaEngine* engine = runtime->deserializeCudaEngine(...); +// 6. Get execution context: +// IExecutionContext* exec_context = engine->createExecutionContext(); +// 7. Set input data: +// int32_t input_index = engine->getBindingIndex("input"); +// int32_t output_index = engine->getBindingIndex("output"); +// void* buffers[2]; +// buffers[input_index] = input_buffer; +// buffers[output_index] = output_buffer; +// 8. Performance inference: +// exec_context->enqueueV2(buffers, stream, nullptr); +// +// We have encapsulated this logic, please use the following programming model. +// +// TRTEngine trt_engine; +// trt_engine.Build(...); +// trt_engine.SetUpInference(...); +// trt_engine.Run(...); +class TRTEngine { + public: + explicit TRTEngine(int device_id); + + nvinfer1::IBuilder* GetTrtBuilder(); + + // TODO(wilber): Modify signature after infrt-trt ready. 
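+  // A condensed usage sketch, mirroring test_trt_engine.cc (tensor names,
+  // shapes and the network-building helper are illustrative only):
+  //
+  //   TRTEngine engine(/*device_id=*/0);
+  //   auto network = BuildMyNetwork(engine.GetTrtBuilder());  // user code
+  //   BuildOptions build_options;
+  //   build_options.max_batch = 4;
+  //   engine.Build(std::move(network), build_options);
+  //
+  //   InferenceOptions inference_options;
+  //   inference_options.batch = 2;
+  //   std::unordered_map<std::string, phi::DenseTensor*> inputs, outputs;
+  //   inputs.emplace("x", &input_tensor);
+  //   outputs.emplace("y", &output_tensor);
+  //   engine.SetUpInference(inference_options, inputs, &outputs);
+  //   engine.GetEngineInfo();
+  //   engine.Run(gpu_context);  // const phi::GPUContext&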
+ void Build(TrtUniquePtr network, + const BuildOptions& build_options); + + // TODO(wilber): Modify signature after infrt-trt ready. + void Run(const phi::GPUContext& ctx); + + // TODO(wilber): How to support multiple execution contexts? + bool SetUpInference( + const InferenceOptions& inference, + const std::unordered_map& inputs, + std::unordered_map* outputs); + + void GetEngineInfo(); + + private: + void FreshDeviceId(); + + bool SetupNetworkAndConfig(const BuildOptions& build, + INetworkDefinition& network, // NOLINT + IBuilderConfig& config); // NOLINT + + bool NetworkToEngine(const BuildOptions& build); + + bool ModelToBuildEnv(TrtUniquePtr network, + const BuildOptions& build); + + void StaticRun(const phi::GPUContext& ctx); + + void DynamicRun(const phi::GPUContext& ctx); + + private: + std::unique_ptr logger_{nullptr}; + TrtUniquePtr builder_{nullptr}; + TrtUniquePtr network_{nullptr}; + std::unique_ptr serialized_engine_{nullptr}; + TrtUniquePtr engine_{nullptr}; + std::vector> contexts_; + std::vector> bindings_; + int device_id_{0}; + bool is_dynamic_shape_{false}; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_options.h b/paddle/infrt/backends/tensorrt/trt_options.h new file mode 100644 index 0000000000000000000000000000000000000000..d5190f5e6220e682c3d3a3ab564e381a3180caff --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_options.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include + +namespace infrt { +namespace backends { +namespace tensorrt { + +// Build default params +constexpr int32_t max_batch_not_provided{0}; +constexpr int32_t default_workspace{16}; +// Inference default params +constexpr int32_t default_batch{1}; +constexpr int32_t batch_not_provided{0}; + +enum class PrecisionConstraints { kNONE, kOBEY, kPREFER }; + +enum class SparsityFlag { kDISABLE, kENABLE, kFORCE }; + +using ShapeRange = + std::array, + nvinfer1::EnumMax()>; + +using IOFormat = std::pair; + +struct BuildOptions { + // Set max batch size. + int32_t max_batch{max_batch_not_provided}; + + // Set workspace size in megabytes (default = 16) + int32_t workspace{default_workspace}; + + // Enable tf32 precision, in addition to fp32 (default = disabled) + bool tf32{false}; + + // Enable fp16 precision, in addition to fp32 (default = disabled) + bool fp16{false}; + + // Enable int8 precision, in addition to fp32 (default = disabled) + bool int8{false}; + + // Control precision constraints. (default = none) + // Precision Constaints: = none, obey, prefer + // none = no constraints + // prefer = meet precision constraints if possible + // obey = meet precision constraints or fail otherwise + PrecisionConstraints precision_constraints{PrecisionConstraints::kNONE}; + + // Save the serialized engine. 
+ bool save{false}; + + // Load a serialized engine. + bool load{false}; + + // Build with dynamic shapes using a profile with the min, max and opt shapes + // provided + std::unordered_map shapes; + + // Type and format of each of the input tensors (default = all inputs in + // fp32:chw) + std::vector input_formats; + + // Type and format of each of the output tensors (default = all outputs in + // fp32:chw) + std::vector output_formats; +}; + +struct InferenceOptions { + int32_t batch{batch_not_provided}; + std::unordered_map> shapes; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..4b129af1d53810c6d37d23270c1118023ae7b3f6 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -0,0 +1,147 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "glog/logging.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +#define IS_TRT_VERSION_GE(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) + +#define IS_TRT_VERSION_LT(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) < version) + +#define TRT_VERSION \ + NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD + +inline nvinfer1::Dims VecToDims(const std::vector& vec) { + int limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) { + assert(false); + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +template +struct TrtDestroyer { + void operator()(T* t) { t->destroy(); } +}; + +template +using TrtUniquePtr = std::unique_ptr>; + +class TrtLogger : public nvinfer1::ILogger { + public: + void log(nvinfer1::ILogger::Severity severity, + const char* msg) noexcept override { + switch (severity) { + case Severity::kVERBOSE: + VLOG(3) << msg; + break; + case Severity::kINFO: + VLOG(2) << msg; + break; + case Severity::kWARNING: + LOG(WARNING) << msg; + break; + case Severity::kINTERNAL_ERROR: + case Severity::kERROR: + LOG(ERROR) << msg; + break; + default: + break; + } + } + nvinfer1::ILogger& GetTrtLogger() noexcept { return *this; } + ~TrtLogger() override = default; +}; + +struct Binding { + bool is_input{false}; + nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; + phi::DenseTensor* buffer{nullptr}; + std::string name; +}; + +class Bindings { + public: + Bindings() = 
default; + + void AddBinding(int32_t b, + const std::string& name, + bool is_input, + phi::DenseTensor* buffer, + nvinfer1::DataType data_type) { + while (bindings_.size() <= static_cast(b)) { + bindings_.emplace_back(); + } + names_[name] = b; + bindings_[b].buffer = buffer; + bindings_[b].is_input = is_input; + bindings_[b].data_type = data_type; + bindings_[b].name = name; + } + + std::vector GetInputBindings() { + return GetBindings([](const Binding& b) -> bool { return b.is_input; }); + } + + std::vector GetOutputBindings() { + return GetBindings([](const Binding& b) -> bool { return !b.is_input; }); + } + + std::vector GetBindings() { + return GetBindings([](const Binding& b) -> bool { return true; }); + } + + std::vector GetBindings( + std::function predicate) { + std::vector bindings; + for (const auto& b : bindings_) { + if (predicate(b)) { + bindings.push_back(b); + } + } + return bindings; + } + + private: + std::unordered_map names_; + std::vector bindings_; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index a8e7e13a681caa4891c42ac01d2a759d878594d1..3ef73171dcdea4e0367837f4b3893405c29a1580 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -54,6 +54,20 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } +template +static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getSI32IntegerAttr(constant); +} + +template +static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getF32FloatAttr(constant); +} + static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { return mlir::SmallVector(1, operand); diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 4d4727ee8e185032c6530cd293b0545283660e46..0f50eb2d8fb4ac83578f13888d05188a9143382f 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -28,6 +28,12 @@ def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< "infrt::createI32Attr($_builder, $_loc, " # value # ")">; +class INFRT_createSI32Attr : NativeCodeCall< + "infrt::createSI32Attr($_builder, $_loc, " # value # ")">; + +class INFRT_createF32Attr : NativeCodeCall< + "infrt::createF32Attr($_builder, $_loc, " # value # ")">; + def INFRT_cvtValueToValueRange : NativeCodeCall< "infrt::cvtValueToValueRange($0)">; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index 7cf5b2fb20f527eefe31f817c7fe85c7864c8669..338b04e001320289b71f6127318e7a073cefcacf 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -24,11 +24,11 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - namespace mlir { namespace pd { +#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT + PaddleDialect::PaddleDialect(MLIRContext *context) : Dialect("pd", context, TypeID::get()) { addOperations< diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt index 794266513eb81b36655f44bfd1f6623216690ac5..99c335ed1782e8089f77bb3f21aadb00f6f6864f 100755 --- a/paddle/infrt/dialect/tensorrt/CMakeLists.txt +++ 
b/paddle/infrt/dialect/tensorrt/CMakeLists.txt @@ -2,11 +2,13 @@ core_gather_headers() gather_srcs(infrt_src SRCS trt_ops.cc + trt_op_converter_pass.cc trt_op_teller_pass.cc trt_graph_fuse_pass.cc trt_graph_split_pass.cc ) mlir_tablegen_on(trt_ops) +mlir_add_rewriter(pd_lower_to_trt) add_executable(trt-exec trt_exec.cc) target_link_libraries(trt-exec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td new file mode 100644 index 0000000000000000000000000000000000000000..701391a750354938efe3703ef8642b21f8a878ea --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -0,0 +1,28 @@ +#ifndef PD_LOWER_TO_TRT +#define PD_LOWER_TO_TRT + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/tensorrt/trt_ops.td" + +def PD2TRT_Matmul_Lower : Pat< + (PD_MatmulOp $X, $Y, $transpose_X, $transpose_Y, ConstantAttr, ConstantAttr), + (TRT_MatrixMultiplyOp $X, $transpose_X, $Y, $transpose_Y)>; + +//TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ElementWiseOperation::kSUM +def PD2TRT_ElementwiseAdd_Lower : Pat< + (PD_Elementwise_addOp $X, $Y, ConstantAttr), + (TRT_ElementWiseOp $X, $Y, (INFRT_createSI32Attr<"0">)/*kSUM*/)>; + +//TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ActivationType::kRELU +def PD2TRT_Relu_Lower : Pat< + (PD_ReluOp $X), + (TRT_ActivationOp $X, (INFRT_createSI32Attr<"0">)/*kRELU*/, (INFRT_createF32Attr<"0.0">), (INFRT_createF32Attr<"0.0">))>; + +//TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ActivationType::kCLIP +def PD2TRT_Relu6_Lower : Pat< + (PD_Relu6Op $X, $threshold), + (TRT_ActivationOp $X, (INFRT_createSI32Attr<"8">)/*kCLIP*/, (INFRT_createF32Attr<"0.0">), $threshold)>; + +#endif // PD_LOWER_TO_TRT diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 1baef7a3f77fdd9d3e363110ea3679aa942e222f..7af1fa53d12e3113d0fe51e7ba15bbd5c082456c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -19,6 +19,7 @@ #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" int main(int argc, char** argv) { @@ -36,9 +37,10 @@ int main(int argc, char** argv) { mlir::PassManager pm(context); mlir::OpPassManager& trt_pass_manager = pm.nest(); - trt_pass_manager.addPass(std::make_unique()); - trt_pass_manager.addPass(std::make_unique()); - trt_pass_manager.addPass(std::make_unique(10)); + trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(std::make_unique(1)); + trt_pass_manager.addPass(std::make_unique()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; return 4; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 1da80ef2c3b1000c045327510a03081f8aa954ca..17633a4e8e99293524e5ca635069267e27c2a603 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -142,7 +142,7 @@ void topoSortBlock(mlir::Block &body) { // NOLINT } 
// namespace // Implementation of the trtGraphFusePass. -void trtGraphFusePass::runOnFunction() { +void TRTGraphFusePass::runOnFunction() { mlir::Block &body = getFunction().front(); mlir::OpBuilder builder(&body, body.begin()); bool changed = false; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index f1e555c6f67ecaadff76fb17f68ebaae1a6528e1..ebd7a4ac4bd3712d98df4a097682787b3977ebfb 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -52,8 +52,8 @@ namespace trt { * "pd.fetch" %d, %f * } */ -class trtGraphFusePass - : public mlir::PassWrapper { +class TRTGraphFusePass + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 257f2b528542557db33121a4c304eb8e6f657007..f24b9cc40cdcc2b065ea033cb03638e8d292df89 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -21,7 +21,7 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 -void trtGraphSplitPass::runOnFunction() { +void TRTGraphSplitPass::runOnFunction() { std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index d30d186647fc32aa4e16047000ee4071effb900d..51f84227243403f5a2299d820acad1b49592abc3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -45,12 +45,12 @@ namespace trt { * "pd.fetch" (%d, %f) * } */ -class trtGraphSplitPass - : public mlir::PassWrapper { +class TRTGraphSplitPass + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } void runOnFunction() override; - explicit trtGraphSplitPass(size_t min_subgraph_size = 3) + explicit TRTGraphSplitPass(size_t min_subgraph_size = 3) : min_subgraph_size_(min_subgraph_size) {} private: diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e34308a2f0fa8c3c0142a62324f00c29b61fd7d3 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
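+// The rewrite patterns pulled in below are generated from
+// pd_lower_to_trt.td; roughly (see the .td file for the exact operands and
+// attribute values):
+//   paddle matmul          -> trt.MatrixMultiply
+//   paddle elementwise_add -> trt.ElementWise  (elementwise_operation = kSUM)
+//   paddle relu / relu6    -> trt.Activation   (activation_type = kRELU / kCLIP)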
+#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" +#include "mlir/IR/Builders.h" +#include "mlir/Transforms/DialectConversion.h" +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/pd_ops.h" + +namespace infrt { +namespace trt { + +#include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT + +using namespace mlir; + +void TRTOpConverterPass::runOnOperation() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to TensorRTDialect from + // PaddleDialect + target.addLegalDialect(); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the TensorRT operations. + RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. + if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); +} + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..0adbf11b89144b0a9e14dc158e2eab1c56e2563a --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "mlir/IR/Dialect.h" +#include "mlir/Pass/Pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" + +namespace infrt { +namespace trt { +/* + * trtOpConverterPass. + * + * source ir: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %d, %f = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * %n = "pd.conv3d"(%m)... + * %s = "pd.conv2d"(%a)... + * "pd.return" %n, %s + * } ... + * "pd.fetch" %d, %f + * } + * + * destination ir: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %d, %f = "pd.graph"(%a) { + * %m = "trt.Convolution"(%a)... + * %n = "trt.Convolution"(%m)... + * %s = "trt.Convolution"(%a)... + * "pd.return" %n, %s + * } ... 
+ * "pd.fetch" %d, %f + * } + */ +struct TRTOpConverterPass + : public mlir::PassWrapper> { + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + ::llvm::StringRef getName() const override { return "trtOpConverterPass"; } + void runOnOperation() final; +}; +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 4e8d40b982b2eaf13aeef4f026d783c3f353c14b..176fdb7a2e054ac2e0c952c7af27995cf8e3c433 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -20,7 +20,7 @@ namespace infrt { namespace trt { // Implementation of the trtOpTellerPass。 -void trtOpTellerPass::runOnFunction() { +void TRTOpTellerPass::runOnFunction() { mlir::Block &body = getFunction().front(); std::vector worklist; worklist.reserve(body.getOperations().size()); diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index fb16c974f7fb3f923bdc460d62d8e5b9f628fff9..8b9a16376ce5527b2133c9f2c2ecea928fb4cd8f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -52,8 +52,8 @@ namespace trt { * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. */ -class trtOpTellerPass - : public mlir::PassWrapper { +class TRTOpTellerPass + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index cc072b6e6885bb68df5cf216fe210aded8a6ec6a..8e3dfffff54f13cc6d1f23c3459ed45257082d4f 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -23,8 +23,48 @@ def TRT_GraphOp : TRT_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - + let arguments = (ins Variadic:$inputs); let results = (outs Variadic:$outputs); } + +def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { + let summary = "TensorRT IActivationLayer"; + let description = [{ + + TensorRT IActivationLayer. + + }]; + let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + DefaultValuedAttr:$alpha, + DefaultValuedAttr:$beta); + + let results = (outs TRT_Tensor:$output); +} + +def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { + let summary = "TensorRT IElementWiseLayer"; + let description = [{ + + TensorRT IElementWiseLayer. + + }]; + let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + + let results = (outs TRT_Tensor:$output); +} + +def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { + let summary = "TensorRT IMatrixMultiplyLayer"; + let description = [{ + + TensorRT IMatrixMultiplyLayer. 
+ + }]; + let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, + TRT_Tensor:$input2, BoolAttr:$transpose2); + + let results = (outs TRT_Tensor:$output); +} + #endif // TRT_OPS diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt index e21cacfbc10b3eaa13004f3aa71a3cb6c9c6f5e8..7055c0c06d5905fa738d8df72c7110fdd82a30d2 100644 --- a/paddle/infrt/kernel/phi/CMakeLists.txt +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -18,6 +18,10 @@ set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/gener add_custom_command( OUTPUT ${infrt_register_phi_kernels_gen_source_file} + COMMAND sh ${infrt_register_phi_kernels_gen_file} + DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} + VERBATIM) +add_custom_target(infrt_register_phi_kernel COMMAND sh ${infrt_register_phi_kernels_gen_file} DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" @@ -25,7 +29,7 @@ add_custom_command( cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc infershaped/infershaped_kernel_launchers.cc - DEPS pten wrapped_infermeta) + DEPS phi wrapped_infermeta) cc_test_tiny(test_infrt_infershape_launchers SRCS infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 331ebcfb4a5d2b1444f1ed475c5f6467f6fb0361..2161e98fac8337a766cfcf7eaa27b4486c48dfcb 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("add.cpu.any.fp32"); + auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir index 75ec98f04661a7d8cfe55c5fbea9dbc87933ad18..b59cfb04816974cbdb923e6d18af1184be963c59 100644 --- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir +++ b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir @@ -7,15 +7,15 @@ func @main() -> tensor { %bias1 = "pd.feed"() {name="input4"} : () -> tensor %bias2 = "pd.feed"() {name="input5"} : () -> tensor - %d = "pd.elementwise_add"(%c, %bias) {axis=1:si32} : (tensor, tensor) -> tensor + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor %e = "pd.relu6"(%d) {} : (tensor) -> tensor %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:si32} : (tensor, tensor) -> tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:si32} : (tensor, tensor) -> tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor "pd.fetch"(%e2) {name="output"} :(tensor)->() diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 
cc95e0bf8fdcc5f97872ef84917ea5910b00980c..7b074d0ebb76d110dc361140bd42f78ef54f224b 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -1,5 +1,5 @@ -# pten auto cmake utils -include(pten) +# phi auto cmake utils +include(phi) # paddle experimental common components add_subdirectory(common) @@ -23,16 +23,16 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) -get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) +get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) # keep this message for debug, remove it later if needless -message(STATUS "All standard pten kernels: ${pten_kernels}") -set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) +message(STATUS "All standard phi kernels: ${phi_kernels}") +set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) -cc_library(pten DEPS ${PTEN_DEPS}) +cc_library(phi DEPS ${PHI_DEPS}) -set(pten_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") -file(WRITE ${pten_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") +set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") +file(WRITE ${phi_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") # generate inner headers include dir for users generate_unify_header(backends) diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index a993cb3ff8041dcaa9734687c0409aaa3e6cebc8..d632db046d15ca73837292a5cb1e44479ab2c6ed 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(lib) -cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api manual_api sparse_api) +cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api) diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 06f3cd844760616b44a1bece9a889a1a2a5f61e9..154b84670aaf992833fccf9297d8b16a081e173f 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -24,13 +24,12 @@ limitations under the License. 
*/ #endif #endif -// new pten apis +// new phi apis #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" -// pten common headers +// phi common headers #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index db0c28198e80a863030b740d192ef662be43fba6..c268742fa567bffecb2fd17a773ab56aee019853 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -49,8 +49,6 @@ namespace paddle { namespace experimental { -class CompatiblePTenTensorUtils; - class AbstractAutogradMeta { public: // No AbstractAutogradMeta should be created @@ -59,7 +57,7 @@ class AbstractAutogradMeta { /** * Tensor is the API description of the basic data structure in the - * [ "Paddle Tensor Operation (pten)" Library ]. + * [ "Paddle Tensor Operation (phi)" Library ]. * * It is not limited to a simple n-dimensional array. * It contains a smart pointer to `TensorImpl`. The data description contained @@ -366,7 +364,7 @@ class PADDLE_API Tensor final { /* Part 5: Data Transform methods */ /* Alert!!!!: All copy method can only deep copy impl, autograd info only be * copied */ - /* out of pten */ + /* out of phi */ /** * @brief Copy the current Tensor data to the specified device * and return the new Tensor. It's usually used to set the input tensor data. @@ -476,9 +474,6 @@ class PADDLE_API Tensor final { /* Part 9: Auto generated Tensor methods */ - private: - friend class CompatiblePTenTensorUtils; - private: /** * [ Why use abstract TensorImpl interface here? ] diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 1ebddc3d3cd1baefcfcb362806d522fe2b3bcb72..5edb83f8c3fc01d198d3f63b64047b9e45cd747b 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) elseif (WITH_ROCM) - hip_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) else() - cc_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) @@ -83,17 +83,16 @@ add_custom_command( DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base} VERBATIM) -cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor_raw pten_context kernel_factory) -cc_library(pten_data_transform SRCS data_transform.cc DEPS pten_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) -cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor_raw pten kernel_dispatch pten_data_transform) +cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) +cc_library(wrapped_infermeta SRCS 
${wrapped_infermeta_source_file} DEPS phi) -cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_api) +cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) +cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) +cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) +cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl) +cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) - -cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform wrapped_infermeta) -cc_library(pten_dygraph_api SRCS ${dygraph_api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_data_transform pten_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) diff --git a/paddle/phi/api/lib/manual_api.cc b/paddle/phi/api/lib/api_custom_impl.cc similarity index 55% rename from paddle/phi/api/lib/manual_api.cc rename to paddle/phi/api/lib/api_custom_impl.cc index 7bd4711cc3f308173ce6fd12225faa46f516cb91..c7400b93fcdc18314318fae9482e1e5e5bfb8aef 100644 --- a/paddle/phi/api/lib/manual_api.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/api/include/manual_api.h" - -#include - -#include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" @@ -25,82 +21,57 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" -PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); -#endif - -#ifdef PADDLE_WITH_XPU -PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); -#endif +#include "glog/logging.h" namespace paddle { namespace experimental { -PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { - // 1. Get kernel signature and kernel +Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "copy", kernel_key); - VLOG(0) << "to API kernel key: " << kernel_key; - VLOG(0) << "to API kernel: " << kernel; + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; - // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x.get()); - kernel_context.EmplaceBackAttr(blocking); - - // 4. Prepare outputs & InferMeta - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPtenPlace(backend)), - phi::DenseTensorMeta()); - phi::MetaTensor meta_out(dense_out.get()); - phi::UnchangedInferMeta(*dense_x, &meta_out); - dense_out->mutable_data(phi::TransToPtenPlace(backend)); - kernel_context.EmplaceBackOutput(dense_out.get()); + + auto dense_x = TensorToDenseTensor(x); + Tensor out; - out.set_impl(dense_out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); - // 5. 
Call kernel - kernel(&kernel_context); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)( + *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out); return out; } -PADDLE_API std::vector split(const Tensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } +std::vector split_impl(const Tensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + + Backend kernel_backend = kernel_key.backend(); + DataLayout kernel_layout = kernel_key.layout(); + DataType kernel_data_type = kernel_key.dtype(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "split", {kernel_backend, kernel_layout, kernel_data_type}); @@ -144,7 +115,6 @@ PADDLE_API std::vector split(const Tensor& x, return out; } + } // namespace experimental } // namespace paddle - -PD_REGISTER_API(Utils); diff --git a/paddle/phi/api/include/manual_api.h b/paddle/phi/api/lib/api_custom_impl.h similarity index 61% rename from paddle/phi/api/include/manual_api.h rename to paddle/phi/api/lib/api_custom_impl.h index 72d348f33918ce545bc7ecf4517d40756cbb1343..5acb68a3281332565d0b094a37fc8ee38c4904ab 100644 --- a/paddle/phi/api/include/manual_api.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,22 +19,15 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" -/** - * This file stores some special APIs that are implemented manually - * or difficult to automatically generated. 
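The rewritten copy_to_impl above also shows the calling convention that api_custom_impl.cc now follows: rather than packing arguments into a phi::KernelContext, it looks the kernel up in the KernelFactory and invokes it through a typed variadic function pointer, so the argument list is checked against the kernel signature at the call site. Condensed, the pattern is the sketch below; variable names reuse those from copy_to_impl, and it should be read as a sketch rather than the verbatim file.

    // Variadic-kernel calling pattern used by the manually implemented APIs.
    // The signature must match the registered phi "copy" kernel exactly.
    using kernel_signature = void (*)(const platform::DeviceContext&,
                                      const phi::DenseTensor&,
                                      phi::Place,
                                      bool,
                                      phi::DenseTensor*);

    auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
        "copy", kernel_key);
    auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
    (*kernel_fn)(
        *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out);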
- */ - namespace paddle { namespace experimental { // TODO(chenweihang): Replace backend by place when place is ready -PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking); +Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking); -// TODO(chentianyu03): Split API has extra logic to calculate the outputs size, -// api_gen do not support -PADDLE_API std::vector split(const Tensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis); +std::vector split_impl(const Tensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis); } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_declare.h b/paddle/phi/api/lib/api_declare.h index 26408290bd325e60952f8f88d413b90451544044..a5d3578d681b6f20992b4560c4dbca4fcd7089a7 100644 --- a/paddle/phi/api/lib/api_declare.h +++ b/paddle/phi/api/lib/api_declare.h @@ -18,5 +18,4 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_registry.h" PD_DECLARE_API(Math); -PD_DECLARE_API(Utils); PD_DECLARE_API(SparseApi); diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 3783620ea449b46ab17ae1ac7d9f7e80ef08cae9..212a2f96452f69496d9ca60fdc3c8cdb643b9679 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -27,7 +27,7 @@ namespace experimental { #endif /** - * Now there is no module to call pten's API. When compiling, the function + * Now there is no module to call phi's API. When compiling, the function * implementation will be optimized. Therefore, the symbol will be exposed * manually for the time being. * @@ -41,7 +41,7 @@ namespace experimental { #define PD_DECLARE_API(name) \ extern PADDLE_API int RegisterSymbolsFor##name(); \ - UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() + UNUSED static int use_phi_api_##name = RegisterSymbolsFor##name() } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_utils.h index 948e40cd28d5847bbcfb0c4c0ec8f1f39246d22f..6c1fa97c0f52a697383a3526220cc758d778823d 100644 --- a/paddle/phi/api/lib/api_utils.h +++ b/paddle/phi/api/lib/api_utils.h @@ -31,6 +31,14 @@ inline std::shared_ptr TensorToDenseTensor( return std::dynamic_pointer_cast(tensor.impl()); } +inline std::shared_ptr TensorToDenseTensor( + const paddle::optional& tensor) { + if (tensor) { + return std::dynamic_pointer_cast(tensor->impl()); + } + return nullptr; +} + inline std::unique_ptr> TensorToDenseTensor( const std::vector& tensors) { auto pt_tensors = std::make_unique>(); @@ -49,12 +57,28 @@ inline std::shared_ptr TensorToSelectedRows( return std::dynamic_pointer_cast(tensor.impl()); } +inline std::shared_ptr TensorToSelectedRows( + const paddle::optional& tensor) { + if (tensor) { + return std::dynamic_pointer_cast(tensor->impl()); + } + return nullptr; +} + /* ----------------- for infer_meta --------------------- */ inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } +inline paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + inline std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; @@ -69,12 +93,20 @@ inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } +inline paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return 
{phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + /* ------------------ for output ----------------------- */ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPtenPlace(backend)), + phi::make_intrusive(phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); out->set_impl(dense_tensor); return dense_tensor.get(); @@ -88,7 +120,7 @@ inline std::vector SetKernelOutput( std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPtenPlace(backend)), + phi::make_intrusive(phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); results[i] = tensor_ptr.get(); out->emplace_back(); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 9fd91f398f7f47133bc0b13b632860c531d87995..ae67e2ebb35ccef7fe07ee8c76db33a459b1dfce 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -38,7 +38,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, const TransformFlag& transform_flag) { bool ret = transform_flag.need_trans_backend() && target != Backend::ALL_BACKEND && - !platform::is_same_place(input, phi::TransToPtenPlace(target)); + !platform::is_same_place(input, phi::TransToPhiPlace(target)); return ret; } @@ -168,10 +168,10 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, out.place(), target_args_def.backend, transform_flag)) { phi::DenseTensor result( phi::make_intrusive( - phi::TransToPtenPlace(target_args_def.backend)), + phi::TransToPhiPlace(target_args_def.backend)), {out.dtype(), out.dims(), out.layout()}); framework::TransDataDevice( - out, phi::TransToPtenPlace(target_args_def.backend), &result); + out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; } return out; @@ -199,6 +199,16 @@ std::shared_ptr PrepareData( return std::make_shared(out); } +std::shared_ptr PrepareData( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + if (input) { + return PrepareData(*input, target_args_def, transform_flag); + } + return {nullptr}; +} + std::unique_ptr> PrepareData( const std::vector& inputs, const phi::TensorArgDef& target_args_def, diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 9942b2f90b03becca5706f773339eb80fd3a4be8..8eb1c4a179aed832bdd7b69dd0112ab46107a718 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -66,6 +66,11 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); +std::shared_ptr PrepareData( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + std::unique_ptr> PrepareData( const std::vector& inputs, const phi::TensorArgDef& target_args_def, diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 5251473f3b5c9ab272499436c8a2091725449644..0e3ca1af4967c2bf2ae302ea656a31198d187f01 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -21,7 +21,7 @@ namespace experimental { namespace detail { BackendSet GetTensorBackendSet(const Tensor& t) { - BackendSet backend_set(phi::TransToPtenBackend(t.inner_place())); + BackendSet backend_set(phi::TransToPhiBackend(t.inner_place())); switch 
(t.layout()) { case DataLayout::MKLDNN: backend_set = backend_set | BackendSet(Backend::MKLDNN); @@ -53,7 +53,7 @@ std::size_t CountLeadingZeros(uint64_t val) { phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) { auto& pool = paddle::platform::DeviceContextPool::Instance(); - return pool.Get(phi::TransToPtenPlace(backend)); + return pool.Get(phi::TransToPhiPlace(backend)); } DataType ParseDataType(DataType dtype) { return dtype; } @@ -83,7 +83,7 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) { Backend ParseBackend(Backend backend) { return backend; } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPtenBackend(tensor.inner_place()); + return phi::TransToPhiBackend(tensor.inner_place()); } Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index ad315ededf5d77a550375855b70bf3927c316941..9a09bc2183ad73857d5afee8909d957e65c5a664 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -51,7 +51,7 @@ struct KernelKeySet { DataType dtype{DataType::UNDEFINED}; // TODO(chenweihang): iterate all kernelkey for kernel selection - phi::KernelKey GetHigestPriorityKernelKey() { + phi::KernelKey GetHighestPriorityKernelKey() { return phi::KernelKey(static_cast(64 - detail::CountLeadingZeros( backend_set.bitset())), layout, diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index cc90c2b819daefd725a71f2787d75e42e37899bd..9e1f59c0aa74329b15efcbff123b137fbf0b1360 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -51,7 +51,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_coo"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_coo"; @@ -86,11 +86,11 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, // create empty SparseCooTensor phi::DenseTensor non_zero_indices( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(indices_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -112,7 +112,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { // 1. 
Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_csr"; if (x.layout() == phi::DataLayout::SPARSE_COO) { kernel_name = "sparse_coo_to_csr"; @@ -148,15 +148,15 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { // create empty SparseCooTensor phi::DenseTensor non_zero_crows( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(crows_meta)); phi::DenseTensor non_zero_cols( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(cols_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -179,7 +179,7 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "sparse_coo_to_dense"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_dense"; @@ -211,7 +211,7 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { // create empty SparseCooTensor auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 963aeec328e2ad2ecf24bb100d3035d3de4251f8..311dd0fc30941d2afb9f1bc1e7ae57f3a449a254 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" @@ -34,7 +33,7 @@ limitations under the License. */ * * We hope to organize the basic implementation of Tensor and the logic related * to Tensor computation into an independent library, which we call - * [Tensor Operation Library, pten], so we extract or rewrite the original + * [Tensor Operation Library, phi], so we extract or rewrite the original * Kernels. * * In the future, the training library, inference library and custom operators @@ -299,72 +298,7 @@ gpuStream_t Tensor::stream() const { } #endif -/* Part 5: Data Transform methods */ - -template -Tensor Tensor::copy_to(const PlaceType &target_place) const { - LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " - "2.3, and will be removed in version 2.4, please use " - "`copy_to` method without template argument instead. 
" - "reason: copying a Tensor to another device does not need " - "to specify the data type template argument."; - return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); -} - -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; - -Tensor Tensor::copy_to(Backend backend, bool blocking) const { - return experimental::copy_to(*this, backend, blocking); -} - -void Tensor::copy_(const Tensor &src, bool blocking) { - if (!src.is_initialized()) { - return; - } - VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { - PADDLE_ENFORCE_EQ(dtype(), - src.dtype(), - platform::errors::PreconditionNotMet( - "Tensor %s has different data type with Tensor %s, " - "Tensor Copy cannot be performed!", - name(), - src.name())); - PADDLE_ENFORCE_EQ(impl()->type_info().id(), - src.impl()->type_info().id(), - platform::errors::PreconditionNotMet( - "Tensor %s has different type with Tensor %s, Tensor " - "Copy cannot be performed!", - name(), - src.name())); - } - auto copy_tensor = - src.copy_to(phi::TransToPtenBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); -} - -/* Part 6: Status utils methods */ +/* Part 5: Status utils methods */ bool Tensor::defined() const { return impl_ != nullptr; } @@ -376,7 +310,7 @@ bool Tensor::is_initialized() const { void Tensor::reset() { impl_.reset(); } -/* Part 7: Operator overloading */ +/* Part 6: Operator overloading */ Tensor &Tensor::operator=(const Tensor &x) & { impl_ = x.impl_; diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index b67810d610f2fbd26d46efe7e2e5ff8343d62aab..aefa26952d1e5f224112576bfbd74be80cca72cc 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -14,15 +14,83 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/tensor_base.h" + namespace paddle { namespace experimental { // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); +Tensor copy_to(const Tensor &x, Backend backend, bool blocking); Tensor Tensor::cast(DataType target_type) const { return experimental::cast(*this, target_type); } +Tensor Tensor::copy_to(Backend backend, bool blocking) const { + return experimental::copy_to(*this, backend, blocking); +} + +template +Tensor Tensor::copy_to(const PlaceType &target_place) const { + LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " + "2.3, and will be removed in version 2.4, please use " + "`copy_to` method without template argument instead. 
" + "reason: copying a Tensor to another device does not need " + "to specify the data type template argument."; + return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); +} + +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor Tensor::copy_to>( + const PlaceType &target_place) const; +template PADDLE_API Tensor Tensor::copy_to>( + const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; + +void Tensor::copy_(const Tensor &src, bool blocking) { + if (!src.is_initialized()) { + return; + } + VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); + if (defined()) { + PADDLE_ENFORCE_EQ(dtype(), + src.dtype(), + platform::errors::PreconditionNotMet( + "Tensor %s has different data type with Tensor %s, " + "Tensor Copy cannot be performed!", + name(), + src.name())); + PADDLE_ENFORCE_EQ(impl()->type_info().id(), + src.impl()->type_info().id(), + platform::errors::PreconditionNotMet( + "Tensor %s has different type with Tensor %s, Tensor " + "Copy cannot be performed!", + name(), + src.name())); + } + auto copy_tensor = + src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); + set_impl(copy_tensor.impl()); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 74ecb3cd65262c3e0598134979c54c02b029d6ee..6d056b54b70058e33501083d9754aa27466c0f59 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS +cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index fc56d201fe3ccc736fdef834e69426e5f0384bf9..31325e22afae31e55a3a2d939739d6745ccd3d36 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -31,13 +31,13 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { } } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src) { return std::make_unique(src); } -phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); +phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { @@ -55,21 +55,21 @@ phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { } } -phi::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src) { +phi::ScalarArray 
MakePhiScalarArray(const paddle::framework::Tensor& src) { return {src}; } -phi::ScalarArray MakePtenScalarArrayFromVar( +phi::ScalarArray MakePhiScalarArrayFromVar( const framework::Variable& variable) { - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - return MakePtenScalarArray(tmp_tensor); + return MakePhiScalarArray(tmp_tensor); } else { - return MakePtenScalarArray(tensor); + return MakePhiScalarArray(tensor); } } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -80,12 +80,12 @@ phi::ScalarArray MakePtenScalarArrayFromVar( } // TODO(chentianyu03): Inplace with ScalarArray constructor -phi::ScalarArray MakePtenScalarArrayFromVarList( +phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list) { if (variable_list.size() == 0) { return phi::ScalarArray(); } - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); std::vector vector_data; vector_data.reserve(variable_list.size()); diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 51aca6a52b41cd59858f3c138423c3debdb40eaf..8b30d5421ab943d568a046ca0fe4698849780ffd 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -30,17 +30,16 @@ limitations under the License. */ namespace paddle { namespace experimental { -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src); -phi::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src); +phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src); -phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable); +phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable); -phi::ScalarArray MakePtenScalarArrayFromVar( - const framework::Variable& variable); +phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); -phi::ScalarArray MakePtenScalarArrayFromVarList( +phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 38366d57841b006726de386a32a5bd09a80f05a7..43e477ef32e9c2a3d914447d610cd6f07b73a92a 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -12,16 +12,16 @@ if(WITH_XPU) add_subdirectory(xpu) endif() -cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) +cc_library(phi_context SRCS all_context.cc DEPS device_context cpu_context) if(WITH_XPU) - add_dependencies(pten_context xpu_context) + add_dependencies(phi_context xpu_context) endif() if(WITH_GPU) - add_dependencies(pten_context gpu_context) + add_dependencies(phi_context gpu_context) endif() if(WITH_CUSTOM_DEVICE) - add_dependencies(pten_context custom_context) + add_dependencies(phi_context custom_context) endif() diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index 3fe03905e42dd33afeedb3a04c2deae6fb0ca1ee..57e6f084fd4c9a643822ddeb46418b0587cb982e 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ 
-18,7 +18,7 @@ limitations under the License. */ // In order to avoid including the header files of each backend in turn, // add this header file // Note: Limit the entry of DeviceContext to backends to avoid multiple include -// path replacement after implementing pten DeviceContext +// path replacement after implementing phi DeviceContext #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" diff --git a/paddle/phi/backends/cpu/CMakeLists.txt b/paddle/phi/backends/cpu/CMakeLists.txt index 965b33f3800edf9597b07ad2446637d2c505fe0f..82ea42566fc1f46a51f4dbf049dcb7470c633c25 100644 --- a/paddle/phi/backends/cpu/CMakeLists.txt +++ b/paddle/phi/backends/cpu/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_MKLDNN) # TODO(wilber): support mkldnn context. - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn eigen3) + cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context mkldnn eigen3) else() - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context eigen3) + cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context eigen3) endif() diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index e67df65850f15545d7da7a21c5edf30c53661b4d..aa14c2a8e3862139b3149bbcdcfa169d7c292377 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/forwards.h" #include "paddle/phi/core/device_context.h" -// TODO(wilber): Do we need to use place in pten kernel? +// TODO(wilber): Do we need to use place in phi kernel? #include "paddle/phi/common/place.h" namespace phi { diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index 9a7de35dd4e66c687bf501845d7b079f90f42464..cb54d3675687d9ae7145c9ac01bc874e811b08f7 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -1,3 +1,3 @@ if (WITH_CUSTOM_DEVICE) - cc_library(custom_context SRCS custom_context.cc DEPS pten_device_context device_manager) + cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager) endif() diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index b7242fc76df7c5db69d58363de6f5427b397aaa6..bc5ef3cd5c078798c5b178df6e7c5341a6fa9d1b 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) +cc_library(phi_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) + hip_library(phi_dynload_cuda SRCS ${HIP_SRCS} DEPS phi_dynamic_loader) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) elseif (WITH_ASCEND_CL) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc npu_hccl) else() - nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader) - 
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) + nv_library(phi_dynload_cuda SRCS ${CUDA_SRCS} DEPS phi_dynamic_loader) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) endif() if (WITH_MKLML) - cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml) + cc_library(phi_dynload_mklml SRCS mklml.cc DEPS phi_dynamic_loader mklml) endif() -cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader) -add_dependencies(pten_dynload_lapack extern_lapack) +cc_library(phi_dynload_lapack SRCS lapack.cc DEPS phi_dynamic_loader) +add_dependencies(phi_dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader) - target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE}) + cc_library(phi_dynload_mklrt SRCS mklrt.cc DEPS phi_dynamic_loader) + target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index fe98fa6bd37ef345b73bd85f1384a2574222dcb3..a526fbfd926393701e2ebb076fa9208810d2be26 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -63,7 +64,8 @@ extern void *cupti_dso_handle; __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index 09591f79ae8fcdfe6430c256dd0defad272a46b3..d14e94024f90fbb00f4ef1ea6963dcc7692924fa 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -1,9 +1,9 @@ if(WITH_GPU) add_subdirectory(cuda) - nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda) + nv_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda) elseif(WITH_ROCM) add_subdirectory(rocm) - hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda) + hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3) +cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3) diff --git a/paddle/phi/backends/gpu/cuda/CMakeLists.txt b/paddle/phi/backends/gpu/cuda/CMakeLists.txt index 7eb1983a793bcfff6fd43040f006bafbfb5012bd..a3393f97d7559314dabf55444c1d8961705a4f05 100644 --- a/paddle/phi/backends/gpu/cuda/CMakeLists.txt +++ b/paddle/phi/backends/gpu/cuda/CMakeLists.txt @@ -1 +1 @@ -nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda) +nv_library(phi_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 28057abed542abd2c120d1199dab7ba776929812..dbcc1660c6472cdddaaa3bea72854f61370c19a0 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -49,7 +49,7 @@ limitations under the 
License. */ // without eigen. #include "unsupported/Eigen/CXX11/Tensor" -// TODO(pten): remove fluid header. +// TODO(phi): remove fluid header. #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 5fa80d3a577419350845d64e7b8cd44f03bb3847..603ce0817c4ebdcb17bb97b14dd0700badcf2385 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -220,4 +220,11 @@ class GPUContext : public DeviceContext { std::unique_ptr impl_; }; +// Note: In order to register the kernel of CUDNN, GPUDNNContext is required. +// Currently, CUDNN kernel directly uses GPUContext. But if the kernel function +// has the same name, this will lead to duplicate instantiations of GPU kernel +// and GPUDNN kernel function, so if we using GPUDNNContext = GPUContext, we +// must use different function name for cudnn kernel +using GPUDNNContext = GPUContext; + } // namespace phi diff --git a/paddle/phi/backends/gpu/rocm/CMakeLists.txt b/paddle/phi/backends/gpu/rocm/CMakeLists.txt index 181f92cbfc31c7f2a407b3f474f3361ae40cea3c..257e4cc8afbcf20966dd377c4945f9a9fa9f8579 100644 --- a/paddle/phi/backends/gpu/rocm/CMakeLists.txt +++ b/paddle/phi/backends/gpu/rocm/CMakeLists.txt @@ -1 +1 @@ -hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda) +hip_library(phi_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index 11dd4f724878266d52fdcbeee031b6ac6a9a9438..23e58d34b25725c048a39244d27f0afd0a917e0f 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -15,7 +15,7 @@ #include #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(pten): remove fluid headers. +// TODO(phi): remove fluid headers. #include "paddle/fluid/platform/enforce.h" static std::once_flag g_device_props_size_init_flag; diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt index 65341dd206fd30c318eb72cb74c4ad3ac4ae212b..4d885757bb1a60578a923e05544d6d209b73acf9 100644 --- a/paddle/phi/backends/xpu/CMakeLists.txt +++ b/paddle/phi/backends/xpu/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place) -cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info) +cc_library(phi_xpu_info SRCS xpu_info.cc DEPS enforce xpulib phi_place) +cc_library(xpu_context SRCS xpu_context.cc DEPS phi_device_context phi_xpu_info) diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 96e95df7a9886f2bb1b5485c822a98d4f42b5f12..d454fc0734c66aca37a55c53ec5a2d9206cfcc5b 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/common/place.h" -// TODO(wilber): The pten computing library requires a component to manage +// TODO(wilber): The phi computing library requires a component to manage // flags. 
#include "paddle/fluid/platform/flags.h" diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index feaf0e12bdb16b04a09814d45a25b6a504a7c697..85a1424ee34e04b50a077f5d8ac88d0a0d2fbe78 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1 +1 @@ -cc_library(pten_place SRCS place.cc) +cc_library(phi_place SRCS place.cc) diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 1d3e4369c69489fc13ec6938fbb9377e93765bb9..4b7bf65be39cbc83688e7dab3fdd745c2be82b22 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -50,7 +50,7 @@ enum class Backend : uint8_t { // the third library backend MKLDNN, - CUDNN, + GPUDNN, // cuDNN and hipDNN // end of backend types NUM_BACKENDS, @@ -112,8 +112,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::MKLDNN: os << "MKLDNN"; break; - case Backend::CUDNN: - os << "CUDNN"; + case Backend::GPUDNN: + os << "GPUDNN"; break; default: { size_t device_type_id_ = static_cast(backend) - @@ -145,8 +145,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::NPU; } else if (s == std::string("MKLDNN")) { return Backend::MKLDNN; - } else if (s == std::string("CUDNN")) { - return Backend::CUDNN; + } else if (s == std::string("GPUDNN")) { + return Backend::GPUDNN; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 1cdcdef2c12eec1c59c0fd2dfdf1c4dd6e62bd37..6ed9c88d705106ce3b03732096fa34b23422875f 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,6 +988,18 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) { return os; } +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace dtype } // namespace phi diff --git a/paddle/phi/common/layout.h b/paddle/phi/common/layout.h index 30832bd60bc0ea167b37de08240aad06c0fe7d1b..648fc02d054cbfd89991e66801c1dac5dffbfe69 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/phi/common/layout.h @@ -32,7 +32,7 @@ enum class DataLayout { NUM_DATA_LAYOUTS, // See Note [ Why we need ALL in basic kernel key member? ] ALL_LAYOUT = UNDEFINED, - // Note: Unify pten DataLayout and fluid::framework::DataLayout, + // Note: Unify phi DataLayout and fluid::framework::DataLayout, // for compatible with fluid DataLayout, here need prefix `k` // Note: The original `kAnyLayout (enum value 2)` is a strange design. 
// `kAnyLayout` originally cannot represent any kind of Layout, diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index bc179e8fed74e22fd85d7ff9372d816edfdce575..644bf3679af2a3ebf05f739a6e8d42011c7e664c 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -43,7 +43,7 @@ const char *AllocationTypeStr(AllocationType type) { case AllocationType::MLU: return "mlu"; default: - PD_THROW("Invalid pten device type."); + PD_THROW("Invalid phi device type."); return {}; } } diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 6ada0630699054ba573f018175d9ba0724216e1b..f4f57a0acbbb386a3642a05e0d0dc70cd082a4d8 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -2,30 +2,30 @@ add_subdirectory(compat) cc_library(errors SRCS errors.cc) -set(pten_enforce_deps errors flags) +set(phi_enforce_deps errors flags) if(WITH_GPU) - set(pten_enforce_deps ${pten_enforce_deps} external_error_proto) + set(phi_enforce_deps ${phi_enforce_deps} external_error_proto) endif() -cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS ${pten_enforce_deps}) +cc_library(phi_enforce INTERFACE SRCS enforce.cc DEPS ${phi_enforce_deps}) -cc_library(kernel_factory SRCS kernel_factory.cc DEPS pten_enforce fluid_convert_utils) -cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context) +cc_library(kernel_factory SRCS kernel_factory.cc DEPS phi_enforce fluid_convert_utils) +cc_library(kernel_context SRCS kernel_context.cc DEPS phi_enforce phi_context) -cc_library(ddim SRCS ddim.cc DEPS pten_enforce) -cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce) -cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce) +cc_library(ddim SRCS ddim.cc DEPS phi_enforce) +cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce) +cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce) +cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce) -cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base) cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim memcpy) +cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) -cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) +cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt index c6bc9e15a535b52def1caef463a8a9228ab51e4a..3423e380970df8a69dc047325e80024dece1f914 100644 --- a/paddle/phi/core/compat/CMakeLists.txt +++ b/paddle/phi/core/compat/CMakeLists.txt @@ -1,14 +1,14 @@ -cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce) +cc_library(arg_map_context SRCS arg_map_context.cc DEPS phi_enforce) cc_library(op_utils SRCS op_utils.cc DEPS 
arg_map_context enforce) set(convert_utils_deps data_type place op_utils) if(WITH_GPU) - set(convert_utils_deps ${convert_utils_deps} pten_gpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_gpu_info) elseif(WITH_ROCM) - set(convert_utils_deps ${convert_utils_deps} pten_gpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_gpu_info) elseif(WITH_XPU) - set(convert_utils_deps ${convert_utils_deps} pten_xpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_xpu_info) endif() if(WITH_CUSTOM_DEVICE) set(convert_utils_deps ${convert_utils_deps} device_manager) diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 39cb3fb5692679ccd624fd2d79bec2bbeb04d257..af29b3bab5c3cc4b2e1caeb4eee9689179464d01 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -79,7 +79,7 @@ class ArgumentMappingContext { virtual bool HasOutput(const std::string& name) const = 0; virtual bool HasAttr(const std::string& name) const = 0; - // now we can't use Attribute here, it will cause pten relay on + // now we can't use Attribute here, it will cause phi relay on // boost::variant and BlockDesc virtual paddle::any Attr(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index a5b7b869b948dfb17b9f58a455bb336a4f021c4f..3b7a733ede90464328600ebd3c7d371314b99cc3 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -25,7 +25,7 @@ limitations under the License. */ namespace phi { -Backend TransToPtenBackend(const phi::Place& place) { +Backend TransToPhiBackend(const phi::Place& place) { if (place.GetType() == phi::AllocationType::CPU) { return Backend::CPU; } else if (place.GetType() == phi::AllocationType::GPU) { @@ -41,7 +41,7 @@ Backend TransToPtenBackend(const phi::Place& place) { } } -phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { +phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { // NOTE(zhiqiu): GetCurrentDeviceId not always success, and device id is not // always needed. // So, add set_device_id parameter here. @@ -58,7 +58,7 @@ phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case phi::Backend::CUDNN: + case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); #endif @@ -87,21 +87,21 @@ phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { } } -std::string TransToPtenKernelName(const std::string& fluid_op_name) { +std::string TransToPhiKernelName(const std::string& fluid_op_name) { return OpUtilsMap::Instance().GetBaseKernelName(fluid_op_name); } -const std::string& TransToFluidOpName(const std::string& pten_kernel_name) { +const std::string& TransToFluidOpName(const std::string& phi_kernel_name) { auto& base_kernel_name_map = OpUtilsMap::Instance().base_kernel_name_map(); auto it = std::find_if(base_kernel_name_map.begin(), base_kernel_name_map.end(), - [&pten_kernel_name](const auto& pair) { - return pair.second == pten_kernel_name; + [&phi_kernel_name](const auto& pair) { + return pair.second == phi_kernel_name; }); if (it != base_kernel_name_map.end()) { return it->first; } - return pten_kernel_name; + return phi_kernel_name; } } // namespace phi diff --git a/paddle/phi/core/compat/convert_utils.h b/paddle/phi/core/compat/convert_utils.h index 058f0ecdf7bc2b5c81a55eb1a6e94cb5ddc30296..621459764873e6681d57813b227076db0b44dd04 100644 --- a/paddle/phi/core/compat/convert_utils.h +++ b/paddle/phi/core/compat/convert_utils.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace phi { -std::string TransToPtenKernelName(const std::string& fluid_op_name); -const std::string& TransToFluidOpName(const std::string& pten_kernel_name); +std::string TransToPhiKernelName(const std::string& fluid_op_name); +const std::string& TransToFluidOpName(const std::string& phi_kernel_name); -Backend TransToPtenBackend(const phi::Place& place); -phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id = true); +Backend TransToPhiBackend(const phi::Place& place); +phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id = true); } // namespace phi diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index ec810d4e16340862faaabe0799e19245551b44c3..bbf634b4b09b90a086505bc173b588d7da2e9668 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -37,7 +37,8 @@ const std::unordered_set standard_kernel_suffixs({ * after 2.0, and can no longer be occupied by the previously abandoned ops. * They are marked here uniformly. 
*/ -const std::unordered_set deprecated_op_names({"flatten", +const std::unordered_set deprecated_op_names({"diag", + "flatten", "flatten_grad", "matmul", "matmul_grad", diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index 75ff9cc28600373eb1f074c0ed91b774ec9ab85a..f84a2bd8d9c5d0634f29485fc07f649ea9fb1b9e 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -22,7 +22,7 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { for (auto& pair : kernel_info_map) { PADDLE_ENFORCE_EQ( - KernelFactory::Instance().HasCompatiblePtenKernel(pair.first), + KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), true, phi::errors::InvalidArgument( "The kernel %s is not ready for custom kernel registering.", diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 44cb63e2b874bd2df9b034ecf9f03053d1888c94..7a0f50533360d71e8cd025a520d753c366c08edb 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -94,9 +94,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, bytes)); bytes = requested_size; } - // TODO(paddle-dev): In case of the allocator of storage_ is different with - // the incoming allocator, we should re-alloc data using the incoming - // allocator. + // NOTE(paddle-dev): In case of the allocator of storage_ is different with + // the incoming allocator, we will re-alloc data using the incoming + // allocator. See DeviceContext.Alloc in core/device_context.cc. if (!holder_ || holder_->size() < bytes + meta_.offset) { meta_.offset = 0; VLOG(10) << "Allocate data with bytes: " << bytes; diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 6ce8bea35d9dd68353a6677b6e59d3e004c68185..29e7dc01f32db20e3756677fe8a48fcb138b3883 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -161,7 +161,7 @@ void* DenseTensor::mutable_data(const Place& place, /* @jim19930609: The following "mutable_data" only supports specific dtypes defined in OpProto. This part need another clean up once the data type across Fluid - and Pten get unified. + and Phi get unified. */ template inline T* DenseTensor::mutable_data(const DDim& dims, diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 9c1d85251f8926141341ee6b8c15e29164894ee7..b139eb99dd4846adb3f7ef3a27507a2ca4478e6d 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -13,8 +13,9 @@ // limitations under the License. #include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/core/selected_rows.h" namespace phi { using DataType = paddle::experimental::DataType; @@ -72,6 +73,7 @@ struct DeviceContext::Impl { } void* Alloc(TensorBase* tensor, + const Place& place, DataType dtype = DataType::UNDEFINED, size_t requested_size = 0) const { PADDLE_ENFORCE_NOT_NULL( @@ -81,6 +83,12 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } + // NOTE(paddle-dev): In case of tensor has already hold allocation and + // is going to allocate allocation on new place, we will clear its holder + // firstly and then re-alloc it. + if (tensor->initialized() && tensor->place() != place) { + ClearHolder(tensor); + } auto* allocator = tensor->numel() == 0 ? 
zero_allocator_ : device_allocator_; return tensor->AllocateFrom( @@ -88,9 +96,11 @@ struct DeviceContext::Impl { } template - T* Alloc(TensorBase* tensor, size_t requested_size = 0) const { + T* Alloc(TensorBase* tensor, + const Place& place, + size_t requested_size = 0) const { DataType dtype = paddle::experimental::CppTypeToDataType::Type(); - return static_cast(Alloc(tensor, dtype, requested_size)); + return static_cast(Alloc(tensor, place, dtype, requested_size)); } void* HostAlloc(TensorBase* tensor, @@ -103,6 +113,9 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } + if (tensor->initialized() && tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_; return tensor->AllocateFrom( const_cast(allocator), dtype, requested_size); @@ -147,6 +160,19 @@ struct DeviceContext::Impl { } private: + void ClearHolder(TensorBase* tensor) const { + if (!tensor->initialized()) return; + + if (DenseTensor::classof(tensor)) { + static_cast(tensor)->clear(); + } else if (SelectedRows::classof(tensor)) { + static_cast(tensor)->mutable_value()->clear(); + } else { + PADDLE_THROW(errors::Unimplemented( + "Only support DenseTensor and SelectedRows now.")); + } + } + const Allocator* device_allocator_{nullptr}; const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; @@ -168,7 +194,7 @@ DeviceContext::DeviceContext(DeviceContext&& other) { impl_ = std::move(other.impl_); } -DeviceContext& DeviceContext::operator=(DeviceContext&&) = default; +DeviceContext& DeviceContext::operator=(DeviceContext&& other) = default; DeviceContext::~DeviceContext() = default; @@ -199,12 +225,12 @@ const Allocator& DeviceContext::GetZeroAllocator() const { void* DeviceContext::Alloc(TensorBase* tensor, DataType dtype, size_t requested_size) const { - return impl_->Alloc(tensor, dtype, requested_size); + return impl_->Alloc(tensor, GetPlace(), dtype, requested_size); } template T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const { - return impl_->Alloc(tensor, requested_size); + return impl_->Alloc(tensor, GetPlace(), requested_size); } void* DeviceContext::HostAlloc(TensorBase* tensor, diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index b31bedd958b4b5bfdf32e80ab81e44dd3307e520..be91409762635e8aabdd6953aa5527d94959e4b2 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -225,8 +225,8 @@ class KernelFactory { KernelNameMap& kernels() { return kernels_; } - bool HasCompatiblePtenKernel(const std::string& op_type) const { - return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end(); + bool HasCompatiblePhiKernel(const std::string& op_type) const { + return kernels_.find(TransToPhiKernelName(op_type)) != kernels_.end(); } const Kernel& SelectKernelOrThrowError(const std::string& kernel_name, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 862f61b20400e674b26f1277caa11a56d85f4e73..2fda3cb6db4fdb4aaac7fc7c88075b833c050bad 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -245,6 +245,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); 
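The DeviceContext::Impl::Alloc changes above thread the target place into allocation and clear a tensor's existing holder when it is already initialized on a different place, so the data is re-allocated on the new place. A self-contained toy of that rule (ToyTensor and AllocOnPlace are illustrative stand-ins, not the phi API):

    #include <cassert>
    #include <string>

    // Toy tensor: "initialized" plus the place its memory currently lives on.
    struct ToyTensor {
      bool initialized = false;
      std::string place;  // e.g. "cpu" or "gpu:0"
      void clear() { initialized = false; place.clear(); }
    };

    void AllocOnPlace(ToyTensor* t, const std::string& target_place) {
      if (t->initialized && t->place != target_place) {
        t->clear();  // drop the old holder before re-allocating elsewhere
      }
      t->initialized = true;  // stand-in for AllocateFrom(allocator, dtype, size)
      t->place = target_place;
    }

    int main() {
      ToyTensor t;
      AllocOnPlace(&t, "cpu");
      AllocOnPlace(&t, "gpu:0");  // already on CPU: cleared, then re-allocated
      assert(t.initialized && t.place == "gpu:0");
      return 0;
    }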
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index f2987e36d3db0163c275562562bf5d6bf7aa91af..ceaebe4e35b7120af160e27fca4347add941d458 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -106,7 +106,7 @@ void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices, const bool coalesced) { this->non_zero_indices_ = non_zero_indices; this->non_zero_elements_ = non_zero_elements; - this->dims_ = dims_; + this->dims_ = dims; this->coalesced_ = coalesced; } diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index efb01d6664238f2dacf6a7860c41fd6ce58757f6..a190b222f86ac4145f7ad02eab043a03038c1096 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -23,39 +23,39 @@ limitations under the License. */ namespace phi { -#define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ +#define _PhiForEachDataTypeHelper_(callback, cpp_type, data_type) \ callback(cpp_type, data_type); -#define _PtenForEachDataType_(callback) \ - _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ - _PtenForEachDataTypeHelper_( \ +#define _PhiForEachDataType_(callback) \ + _PhiForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::float16, DataType::FLOAT16); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::bfloat16, DataType::BFLOAT16); \ - _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ - _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ - _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ - _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ - _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ - _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ - _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PhiForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PhiForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PhiForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PhiForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX64); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX128); template inline void VisitDataType(phi::DataType type, Visitor visitor) { -#define PtenVisitDataTypeCallback(cpp_type, data_type) \ - do { \ - if (type == data_type) { \ - visitor.template apply(); \ - return; \ - } \ +#define PhiVisitDataTypeCallback(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply(); \ + return; \ + } \ } while (0) - _PtenForEachDataType_(PtenVisitDataTypeCallback); -#undef PtenVisitDataTypeCallback + _PhiForEachDataType_(PhiVisitDataTypeCallback); +#undef PhiVisitDataTypeCallback PADDLE_THROW(phi::errors::Unimplemented( "Not supported phi::DataType(%d) as data type.", static_cast(type))); } diff --git a/paddle/phi/infermeta/CMakeLists.txt 
b/paddle/phi/infermeta/CMakeLists.txt index c077e7b4c55636e07eaf9353d009e857c239b8ec..f7102629d213c08ecb3da1dfdd974e3354105e61 100644 --- a/paddle/phi/infermeta/CMakeLists.txt +++ b/paddle/phi/infermeta/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index c4ae2e0b371c1336aeac69ec7eda208ce35e09d4..7d403fee94300e9517fcc517f4d088470d772e35 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -16,6 +16,54 @@ limitations under the License. */ namespace phi { +void BilinearTensorProductGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto weight_dims = weight.dims(); + auto out_dims = dout.dims(); + + PADDLE_ENFORCE_EQ( + out_dims.size(), + 2UL, + errors::InvalidArgument("The input(Out@GRAD) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + x_dims[0], + out_dims[0], + errors::InvalidArgument( + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X).")); + PADDLE_ENFORCE_EQ( + weight_dims[0], + out_dims[1], + errors::InvalidArgument( + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight).")); + + if (dx) { + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + } + if (dy) { + dy->set_dims(y_dims); + dy->set_dtype(y.dtype()); + } + if (dweight) { + dweight->set_dims(weight_dims); + dweight->set_dtype(weight.dtype()); + } + if (dbias) { + dbias->set_dims({1, out_dims[1]}); + dbias->set_dtype(dout.dtype()); + } +} + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -28,4 +76,33 @@ void GeneralBinaryGradInferMeta(const MetaTensor& x, } } +void GeneralTernaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz) { + if (dx) { + dx->share_meta(x); + } + if (dy) { + dy->share_meta(y); + } + if (dz) { + dz->share_meta(z); + } +} + +void GumbelSoftmaxGradInferMeta(const MetaTensor& out, + const MetaTensor& dout, + int axis, + MetaTensor* dx) { + PADDLE_ENFORCE_EQ( + out.dims(), + dout.dims(), + errors::InvalidArgument( + "Input(Out) and its gradients should have the same shape.")); + dx->share_meta(dout); +} + } // namespace phi diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 965c380db25ecc55c5cac072d593003e0dbe3334..c7090ed664b286e5a8d2c8e327f3c1ea37a71f04 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -20,9 +20,29 @@ limitations under the License. 
*/ namespace phi { +void BilinearTensorProductGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, MetaTensor* dy); +void GeneralTernaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz); + +void GumbelSoftmaxGradInferMeta(const MetaTensor& out, + const MetaTensor& dout, + int axis, + MetaTensor* dx); } // namespace phi diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 58cd43998b8a5eeba52f324dc1609d72c61ff95b..dfaabf7cae21ec9b91624211ce9b852148dd7cc2 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -310,6 +310,7 @@ void BCELossInferMeta(const MetaTensor& input, } out->set_dims(input_dims); + out->set_dtype(input.dtype()); out->share_lod(input); } diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index d72033f95285738f20c75b5d2a678fe4811e8a18..7a0db3d5c17ee3cd40891601009a3841f603bb32 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -18,6 +18,72 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +void BilinearTensorProductInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + paddle::optional bias, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto weight_dims = weight.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size(), + 2UL, + errors::InvalidArgument("The input(X) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + y_dims.size(), + 2UL, + errors::InvalidArgument("The input(Y) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + weight_dims.size(), + 3UL, + errors::InvalidArgument( + "Expected the input(Weight) is a 3D tensor. 
But received %dD tensor.", + weight_dims.size())); + if (config.is_runtime || (x_dims[0] > 0 && y_dims[0] > 0)) { + PADDLE_ENFORCE_EQ(x_dims[0], + y_dims[0], + errors::InvalidArgument( + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y).")); + } + PADDLE_ENFORCE_EQ(x_dims[1], + weight_dims[1], + errors::InvalidArgument( + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight).")); + PADDLE_ENFORCE_EQ(y_dims[1], + weight_dims[2], + errors::InvalidArgument( + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight).")); + + if (bias.get_ptr()) { + auto bias_dims = bias->dims(); + PADDLE_ENFORCE_EQ(bias_dims.size(), + 2UL, + errors::InvalidArgument( + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector).")); + PADDLE_ENFORCE_EQ(bias_dims[0], + 1UL, + errors::InvalidArgument( + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector).")); + PADDLE_ENFORCE_EQ(bias_dims[1], + weight_dims[0], + errors::InvalidArgument( + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight).")); + } + + out->set_dims({x_dims[0], weight_dims[0]}); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 589fc33333d0c3daf8291f75801b6484d8ddf053..a5fb2a4cbddc33b97b31a26fa29293868808875a 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,13 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +void BilinearTensorProductInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + paddle::optional bias, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc new file mode 100644 index 0000000000000000000000000000000000000000..52aeaef8438548542e2ecac4219f6eb2a8e8462b --- /dev/null +++ b/paddle/phi/infermeta/ternary.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +void AddmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float alpha, + float beta, + MetaTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto ndim_input = input_dims.size(); + auto ndim_x = x_dims.size(); + auto ndim_y = y_dims.size(); + + VLOG(3) << "addmm operator input.shape=" << input_dims + << " x.shape=" << x_dims << " y.shape=" << y_dims << " beta=" << beta + << " alpha=" << alpha << " ndim_input=" << ndim_input + << " ndim_x=" << ndim_x << " ndim_y=" << ndim_y; + + PADDLE_ENFORCE_NE( + product(input_dims), + 0, + errors::PreconditionNotMet("The Input variable 'input' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(x_dims), + 0, + errors::PreconditionNotMet("The Input variable 'x' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(y_dims), + 0, + errors::PreconditionNotMet("The Input variable 'y' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + // dim check + PADDLE_ENFORCE_EQ( + ndim_input, + 2, + errors::InvalidArgument("The input tensor input's dimension must be 2. " + "But received input's dimension = [%s].", + ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_x, + 2, + errors::InvalidArgument("The input tensor x's dimension must be 2. " + "But received x's dimension = [%s].", + ndim_x)); + PADDLE_ENFORCE_EQ( + ndim_y, + 2, + errors::InvalidArgument("The input tensor y's dimension must be 2. " + "But received y's dimension = [%s].", + ndim_y)); + + std::vector output_dims; + output_dims.push_back(x_dims[0]); + output_dims.push_back(y_dims[1]); + + out->set_dims(make_ddim(output_dims)); + out->share_lod(input); + out->set_dtype(input.dtype()); +} + +} // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h new file mode 100644 index 0000000000000000000000000000000000000000..d6223dd87aaf8e8c20c00ad72523e160ee15faee --- /dev/null +++ b/paddle/phi/infermeta/ternary.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" + +namespace phi { + +// Common InferMeta Functions for ternary operators, The format like: +// +// 1. void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// ..., +// MetaTensor* out) {} +// +// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. +// Because functions in this file not only can infer shape, but also need +// infer lod or other useful data. 
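For the AddmmInferMeta implementation above: addmm computes out = beta * input + alpha * x * y, so with the enforced 2-D input, x and y the output shape is simply {x_dims[0], y_dims[1]}; for example, x of shape {2, 3} and y of shape {3, 4} give an output of shape {2, 4}. A small sketch of just that dimension arithmetic (AddmmOutDims is a hypothetical helper, not part of phi):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Shape rule only; the real InferMeta also sets dtype and shares lod.
    std::vector<int64_t> AddmmOutDims(const std::vector<int64_t>& x_dims,
                                      const std::vector<int64_t>& y_dims) {
      assert(x_dims.size() == 2 && y_dims.size() == 2);
      return {x_dims[0], y_dims[1]};
    }

    int main() {
      // input: {2, 4}, x: {2, 3}, y: {3, 4}  ->  out: {2, 4}
      auto out_dims = AddmmOutDims({2, 3}, {3, 4});
      assert(out_dims[0] == 2 && out_dims[1] == 4);
      return 0;
    }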
+ +void AddmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float alpha, + float beta, + MetaTensor* out); + +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index ca71d6a56d8e785ab18e047e6ae552f5994cc0f0..49fd0a343a470f2545fc563366256f4f92294297 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/unary.h" +#include #include #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" @@ -26,6 +27,30 @@ void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { out->share_meta(x); } +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out) { + auto rank = x.dims().size(); + PADDLE_ENFORCE_GE( + axis, + -rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + PADDLE_ENFORCE_LT( + axis, + rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + out->share_meta(x); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, @@ -74,12 +99,42 @@ void FlattenInferMeta(const MetaTensor& x, } } +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out) { + UnchangedInferMetaCheckAxis(x, axis, out); +} + void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(out_dtype); out->set_layout(x.layout()); } +void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { + auto dims = x.dims(); + auto rank = dims.size(); + PADDLE_ENFORCE_GE(rank, + 2, + errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. But " + "received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + dims[rank - 2], + dims[rank - 1], + errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should be symmetric " + "positive-definite matrices and have the same size. But received " + "X's shape[-2] = %d and shape[-1] = %d.", + dims[rank - 2], + dims[rank - 1])); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); +} + void CopyToInferMeta(const MetaTensor& x, Backend backend, bool blocking, @@ -93,6 +148,18 @@ void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_layout(x.layout()); } +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { + PADDLE_ENFORCE_EQ( + product(x.dims()), + 1UL, + errors::InvalidArgument("The number of elements in Input(X) should be 1." 
+ "Now the number is %d.", + product(x.dims()))); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + static phi::DDim ValidateShape(const std::vector shape, const phi::DDim& in_dims) { const int64_t in_size = phi::product(in_dims); @@ -233,6 +300,41 @@ void InferMetaFromVecValue(const MetaTensor& x, } } +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out) { + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + PADDLE_ENFORCE_GT(x_rank, + 0, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be > 0, but got %d.", + x_rank)); + PADDLE_ENFORCE_LE(x_rank, + 2, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be <= 2, but got %d.", + x_rank)); + + std::vector out_dims(x_rank); + for (int64_t i = 0; i < x_rank - 1; i++) { + out_dims[i] = x_dim[i]; + } + + PADDLE_ENFORCE_GT( + num_samples, + 0, + errors::InvalidArgument( + "The number of samples should be > 0, but got %d.", num_samples)); + out_dims[x_rank - 1] = num_samples; + + out->set_dims(make_ddim(out_dims)); + out->set_dtype(DataType::INT64); +} + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, @@ -715,6 +817,96 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out) { + auto x_dims = x.dims(); + + if (x_dims.size() == 1UL) { + int64_t size_ = x_dims[0] + std::abs(offset); + out->set_dims({size_, size_}); + out->set_dtype(x.dtype()); + } else if (x_dims.size() == 2UL) { + int64_t size_ = 0; + if (offset >= 0) { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + size_ = x_dims[1] - offset; + } + } else { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } + } + out->set_dims({size_}); + out->set_dtype(x.dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } +} + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { + out->set_dtype(DataType::INT64); + out->set_dims({1}); +} + +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); + + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[1])); + } else { + PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[3])); + } + auto 
output_dims = input_dims; + output_dims[0] = input_dims[0]; + if (!channel_last) { + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + } else { + output_dims[1] = input_dims[1] * upscale_factor; + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); + } + out->set_dtype(x.dtype()); + out->set_dims(output_dims); +} + } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 21cbe76bb13c0e372668466e1ba0ed415c77f660..4fab1ec68ec1e71af5e55a9852cd68deccc09a7c 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -34,13 +34,26 @@ class MetaConfig; void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, MetaTensor* out); +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out); + void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); +void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); + void CopyToInferMeta(const MetaTensor& x, Backend backend, bool blocking, @@ -48,10 +61,17 @@ void CopyToInferMeta(const MetaTensor& x, void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); + void InferMetaFromVecValue(const MetaTensor& x, const std::vector& shape, MetaTensor* out); +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out); + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, @@ -104,4 +124,16 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out); + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out); + +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index ef085e71f5dcc295a417f0c6aa83fc7cdfc20a8d..ef51d6daf6a0052f39c2cf6253c208412cbb6904 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -3,22 +3,30 @@ set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declaratio file(WRITE ${kernel_declare_file} "// Generated by the paddle/phi/kernels/CMakeLists.txt. 
DO NOT EDIT!\n\n#pragma once\n\n") file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h\"\n\n") -# pten functors and functions called by kernels +# phi functors and functions called by kernels add_subdirectory(funcs) -# pten depends all pten kernel targets -set_property(GLOBAL PROPERTY PTEN_KERNELS "") +# phi depends all phi kernel targets +set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -# auto build kernel targets by cmake -register_kernels(DEPS ${COMMON_KERNEL_DEPS}) +# NOTE: Some kernels depend on some targets that are not commonly used. +# These targets are not suitable for common dependencies. +# In this case, you need to manually generate them here. +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel) +kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -# pten sparse kernels +# auto parse and build kernel targets by cmake +register_kernels(EXCLUDES ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS}) + +# phi sparse kernels add_subdirectory(sparse) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/phi/kernels/addmm_grad_kernel.h b/paddle/phi/kernels/addmm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0d2f445a61de0cb186bdd7fbe7a8a7c0bce2869e --- /dev/null +++ b/paddle/phi/kernels/addmm_grad_kernel.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AddmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/phi/kernels/addmm_kernel.h similarity index 55% rename from paddle/fluid/operators/diag_v2_op.h rename to paddle/phi/kernels/addmm_kernel.h index f0bf04badab79db3ff6c72ea47f4b212832c041f..3674305796cde35f164289f5f405fee4c30e1216 100644 --- a/paddle/fluid/operators/diag_v2_op.h +++ b/paddle/phi/kernels/addmm_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,21 +14,17 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/core/dense_tensor.h" -namespace paddle { -namespace operators { +namespace phi { -using DDim = framework::DDim; +template +void AddmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float alpha, + float beta, + DenseTensor* out); -static inline int ComputeStride(int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..499aa1e0b2ea958935c20bc9bbcde89d6a15d9a4 --- /dev/null +++ b/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearTensorProductGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* dweight, + DenseTensor* dbias); + +} // namespace phi diff --git a/paddle/phi/kernels/bilinear_tensor_product_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b34e8946ddd58e0431e52804d0f621d3eb25720c --- /dev/null +++ b/paddle/phi/kernels/bilinear_tensor_product_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void BilinearTensorProductKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + paddle::optional bias, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/fluid/operators/eye_op.cu b/paddle/phi/kernels/cholesky_grad_kernel.h similarity index 52% rename from paddle/fluid/operators/eye_op.cu rename to paddle/phi/kernels/cholesky_grad_kernel.h index 8d55235a54c70b1a4db4bd7f355332c923207591..b170a3d7ffcfacdf8186d0f54450a38b536949d5 100644 --- a/paddle/fluid/operators/eye_op.cu +++ b/paddle/phi/kernels/cholesky_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#pragma once -namespace ops = paddle::operators; -namespace plf = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - eye, ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel); +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CholeskyGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + bool upper, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/fluid/operators/size_op.cu b/paddle/phi/kernels/cholesky_kernel.h similarity index 56% rename from paddle/fluid/operators/size_op.cu rename to paddle/phi/kernels/cholesky_kernel.h index de56ecd95270577689f699462b9273b43f34595e..5dc1473d8dbcad895abefccb7d034d686eed1775 100644 --- a/paddle/fluid/operators/size_op.cu +++ b/paddle/phi/kernels/cholesky_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/size_op.h" +#pragma once -REGISTER_OP_CUDA_KERNEL( - size, paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, paddle::operators::SizeKernel, - paddle::operators::SizeKernel); +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/copy_kernel.h b/paddle/phi/kernels/copy_kernel.h index a3ba6eabcdd694aa5bfca4a0ee669ccca086e78f..95df29f7e653af4d27fccc009da1fcdaa2264f59 100644 --- a/paddle/phi/kernels/copy_kernel.h +++ b/paddle/phi/kernels/copy_kernel.h @@ -22,6 +22,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst); } // namespace phi diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 71d818c45e6f3f28697d3496cc9ae8a0d209ce6e..efe7d090405df72ce07b2b2bb7f045977d982eff 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/abs_kernel.h" -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -29,7 +29,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { out, size_t(x.numel() * sizeof(phi::funcs::Real))); auto* out_data = out->data>(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/fluid/operators/addmm_op.cu b/paddle/phi/kernels/cpu/addmm_grad_kernel.cc similarity index 50% rename from paddle/fluid/operators/addmm_op.cu rename to paddle/phi/kernels/cpu/addmm_grad_kernel.cc index e42d9c84f9234a756362acd67029b2ace4f6c9fb..6032f15e0f75e87fc491212361f77d46f98c9ea3 100644 --- a/paddle/fluid/operators/addmm_op.cu +++ b/paddle/phi/kernels/cpu/addmm_grad_kernel.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/addmm_op.h" +#include "paddle/phi/kernels/addmm_grad_kernel.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" -REGISTER_OP_CUDA_KERNEL(addmm, ops::AddMMKernel, - ops::AddMMKernel); -REGISTER_OP_CUDA_KERNEL(addmm_grad, - ops::AddMMGradKernel, - ops::AddMMGradKernel); +PD_REGISTER_KERNEL( + addmm_grad, CPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/addmm_kernel.cc b/paddle/phi/kernels/cpu/addmm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff86b655ed3ef2d195c5d6c6e49883f364bcc2e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/addmm_kernel.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_kernel_impl.h" + +PD_REGISTER_KERNEL(addmm, CPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2268212316af68433a18d9037136e3e0f733e4dc --- /dev/null +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product_grad, + CPU, + ALL_LAYOUT, + phi::BilinearTensorProductGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..25bc5913865a0717024c3bfe24281ab3b110b159 --- /dev/null +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product, + CPU, + ALL_LAYOUT, + phi::BilinearTensorProductKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad9d51db4921e263fd959271e053a8324c52bb64 --- /dev/null +++ b/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cholesky_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + cholesky_grad, CPU, ALL_LAYOUT, phi::CholeskyGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cholesky_kernel.cc b/paddle/phi/kernels/cpu/cholesky_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d9b6b52d75d6924e091c733f2a051f9281b83b2 --- /dev/null +++ b/paddle/phi/kernels/cpu/cholesky_kernel.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/cholesky_kernel.h" + +#include "Eigen/Cholesky" +#include "Eigen/Core" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out) { + using EigenMatrix = + Eigen::Matrix; + using InputMatrixMap = Eigen::Map; + using OutputMatrixMap = Eigen::Map; + + auto& dims = x.dims(); + int batch_count = 1; + for (int i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + auto m = dims[dims.size() - 1]; + + const auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + // Cholesky decomposition for each matrix, maybe can use multi threads + for (int i = 0; i < batch_count; i++) { + auto input = InputMatrixMap(x_data + i * m * m, m, m); + auto output = OutputMatrixMap(out_data + i * m * m, m, m); + if (upper) { + Eigen::LLT< + Eigen::Matrix, + Eigen::UpLoType::Upper> + llt_decomposition(input); + PADDLE_ENFORCE_EQ(llt_decomposition.info(), + Eigen::Success, + errors::InvalidArgument( + "Cholesky decomposition was not successful. The " + "%d-th input matrice " + "might not be not be positive definite.", + i)); + output = llt_decomposition.matrixU(); + } else { + Eigen::LLT< + Eigen::Matrix, + Eigen::UpLoType::Lower> + llt_decomposition(input); + PADDLE_ENFORCE_EQ(llt_decomposition.info(), + Eigen::Success, + errors::InvalidArgument( + "Cholesky decomposition was not successful. The " + "%d-th input matrice " + "might not be not be positive definite.", + i)); + output = llt_decomposition.matrixL(); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + cholesky, CPU, ALL_LAYOUT, phi::CholeskyKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 7dcd75d39e4df5b7bc634c4e16f7843bf5044c94..1af071f23ddc520e6733acdbeec3a0652f4e1d8f 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -28,6 +28,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1e0b8e31e78fd74e6a15722546971a3cb72807a --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/diag_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out) { + auto* x_data = x.data(); + auto x_dims = x.dims(); + T* out_data = dev_ctx.template Alloc(out); + auto out_dims = out->dims(); + + int64_t i; + if (x_dims.size() == 1) { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + + auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + auto out_stride_1 = phi::funcs::ComputeStride(1, out_dims); + out_data += (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + for (i = 0; i < x_length; i++) { + out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; + } + } else { + auto out_length = out_dims[0]; + const int& x_stride_0 = phi::funcs::ComputeStride(0, x_dims); + const int& x_stride_1 = phi::funcs::ComputeStride(1, x_dims); + + auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + x_data += (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); + for (i = 0; i < out_length; i++) { + out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 0b29091367c83acee19e703f450d16602f322f3c..c878e8133ffc0dc0c5e4992b315af48bc6cdaf03 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -170,6 +170,7 @@ PD_REGISTER_KERNEL(subtract_grad, int16_t, int, int64_t, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -182,5 +183,6 @@ PD_REGISTER_KERNEL(subtract_double_grad, int16_t, int, int64_t, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1fe4f026ab07ef2370c2c69ac10a3a9c831c6a3 --- /dev/null +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/erfinv_grad_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + erfinv_grad, CPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f3a740f9d9be3e68c5e7d3a13933d6b09cdbc75 --- /dev/null +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/erfinv_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/eye_kernel.cc b/paddle/phi/kernels/cpu/eye_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0d0f2c43909690078ff268356242b557dd6e6aa --- /dev/null +++ b/paddle/phi/kernels/cpu/eye_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/eye_kernel.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(eye, + CPU, + ALL_LAYOUT, + phi::EyeKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 6b0183d31c6ec3dc3e6712043f27678c3f3a6bb2..86576a861aa4834a4b39b50594565a2d4b3ac510 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -35,7 +35,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValue(dev_ctx, out, val.to()); } diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4c131e72b59a9b6a975dbb7f43d33321ae9a549 --- /dev/null +++ b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(gumbel_softmax_grad, + CPU, + ALL_LAYOUT, + phi::GumbelSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb406665c5f4f63a67ea84f5516b93fc82644e67 --- /dev/null +++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gumbel_softmax_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct GumbleNoiseGenerator { + static void Transform(const CPUContext& ctx, + const T* input_data, + T* output_data, + int size_to_axis, + int size_from_axis, + const float temperature) { + // generate uniform random number + const int size = size_to_axis * size_from_axis; + std::uniform_real_distribution dist(0.00001, 1); + auto engine = ctx.GetGenerator()->GetCPUEngine(); + DenseTensor random_tensor; + random_tensor.Resize(make_ddim({size})); + auto* random_data = ctx.template Alloc(&random_tensor); + for (int64_t i = 0; i < size; ++i) { + random_data[i] = dist(*engine); + } + + // generate gumbel noise + DDim dim_2d{size_to_axis, size_from_axis}; + auto gumbel_noise_eigen = EigenMatrix::From(random_tensor, dim_2d); + gumbel_noise_eigen = -(((-(gumbel_noise_eigen.log())).log())); + + // add noise + for (int64_t i = 0; i < size_to_axis * size_from_axis; i++) { + output_data[i] = (input_data[i] + random_data[i]) / temperature; + } + } +}; + +template +struct OneHotGenerator { + static void Transform(const CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + int axis) { + DenseTensor index; + std::vector index_dim; + const auto rank = x.dims().size(); + const int size_to_axis = funcs::SizeToAxis(axis, x.dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, x.dims()); + const int size_out_axis = funcs::SizeOutAxis(axis, x.dims()); + + for (int i = 0; i < x.dims().size(); i++) { + if (i != axis) index_dim.push_back(x.dims().Get()[i]); + } + DDim index_ddim(index_dim.data(), rank - 1); + index.Resize(index_ddim); + auto* index_data = ctx.template Alloc(&index); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMaxFunctor functor##rank; \ + functor##rank(ctx, *out, &index, axis); + switch (out->dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_ENFORCE_LE( + out->dims().size(), + 6, + errors::InvalidArgument("gumbel_softmax operator doesn't supports " + "tensors whose ranks are greater " + "than 6 in CPU mode.")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + + funcs::set_constant(ctx, out, 0.0); + for (int i = 0; i < size_to_axis; i++) { + for (int j = 0; j < size_out_axis; j++) { + *(out->data() + i * size_from_axis + j + + index_data[i * size_out_axis + j] * size_out_axis) = 1.0; + } + } + } +}; + +} // namespace phi + +PD_REGISTER_KERNEL( + gumbel_softmax, CPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/increment_kernel.cc b/paddle/phi/kernels/cpu/increment_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..70c178d25a10ab9e65cc4fbbc8f5f3a5176c17ca --- /dev/null +++ b/paddle/phi/kernels/cpu/increment_kernel.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/increment_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/increment_kernel_impl.h" + +PD_REGISTER_KERNEL(increment, + CPU, + ALL_LAYOUT, + phi::IncrementKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 581c5f90f35e5cadb239291d143ce54d499c017e..5cfcfe62c7816c84a4f2876942b4d9b30dfad167 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -139,7 +139,8 @@ PD_REGISTER_KERNEL(subtract_raw, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, @@ -160,7 +161,8 @@ PD_REGISTER_KERNEL(multiply_raw, int64_t, bool, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/multinomial_kernel.cc b/paddle/phi/kernels/cpu/multinomial_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9c2a569e0650dececa4541b3fdc7eba9b3f022e --- /dev/null +++ b/paddle/phi/kernels/cpu/multinomial_kernel.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/multinomial_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/multinomial_functor.h" + +namespace phi { + +template +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out) { + auto* in_data = x.data(); + int64_t* out_data = dev_ctx.template Alloc(out); + auto in_dims = x.dims(); + int64_t in_rank = in_dims.size(); + const int64_t num_categories = in_dims[in_rank - 1]; + const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; + + funcs::MultinomialFunctor(dev_ctx, + out_data, + in_data, + num_samples, + replacement, + num_categories, + num_distributions); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + multinomial, CPU, ALL_LAYOUT, phi::MultinomialKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/cpu/mv_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3b7f94be41948267ed486a5109ffcc2d6db99fb --- /dev/null +++ b/paddle/phi/kernels/cpu/mv_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mv_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +void MvGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& vec, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* vec_grad) { + auto dout = out_grad; + auto dx = x_grad; + auto dvec = vec_grad; + + auto dim_x = x.dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + // get data ptr + const T* x_data = x.data(); + const T* vec_data = vec.data(); + const T* dout_data = dout.data(); + + if (dx) { + T* dx_data = dev_ctx.template Alloc(dx); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + dx_data[i * n + j] = dout_data[i] * vec_data[j]; + } + } + } + + if (dvec) { + T* dvec_data = dev_ctx.template Alloc(dvec); + + auto blas = phi::funcs::GetBlas(dev_ctx); + + blas.GEMV(true, + dim_x[0], + dim_x[1], + static_cast(1), + x_data, + dout_data, + static_cast(0), + dvec_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mv_grad, CPU, ALL_LAYOUT, phi::MvGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/mv_kernel.cc b/paddle/phi/kernels/cpu/mv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f76ddda6dde5ba686fa7403910a245644a16f2d --- /dev/null +++ b/paddle/phi/kernels/cpu/mv_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/mv_kernel_impl.h" + +PD_REGISTER_KERNEL(mv, CPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b32065d4f0a145c382648cc1f192b032f7df0802 --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_shuffle_grad, + CPU, + ALL_LAYOUT, + phi::PixelShuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..80f8fa7b50efb7f2e685b7e202d89c0f9a382a18 --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_kernel.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pixel_shuffle, CPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/randint_kernel.cc b/paddle/phi/kernels/cpu/randint_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..feb418949ba40d3bf553c2df0b4300cc686a0ef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/randint_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
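The MvGradKernel added above computes, for out = A * v with A in R^{m x n}, the gradients dA = dout * v^T (explicit loops) and dv = A^T * dout (via blas.GEMV with its transpose flag set). A plain-loop sketch of the same math with made-up values, independent of Paddle's blas wrapper:

    #include <cstdio>

    int main() {
      const int m = 2, n = 3;
      const double a[] = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
      const double v[] = {1, 0, -1};
      const double dout[] = {1, 2};           // upstream gradient of out
      double da[6] = {0}, dv[3] = {0};
      for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j) da[i * n + j] = dout[i] * v[j];   // outer product dout * v^T
      for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i) dv[j] += a[i * n + j] * dout[i];  // A^T * dout
      printf("dv = %f %f %f\n", dv[0], dv[1], dv[2]);                 // 9 12 15
      return 0;
    }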
+ +#include "paddle/phi/kernels/randint_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RandintRawKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out) { + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc(out); + auto numel = out->numel(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + std::uniform_int_distribution dist(low, high - 1); + for (int64_t i = 0; i < numel; ++i) { + data[i] = dist(*engine); + } +} + +template +void RandintKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out) { + RandintRawKernel(dev_ctx, low, high, shape, dtype, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + randint_raw, CPU, ALL_LAYOUT, phi::RandintRawKernel, int, int64_t) {} + +PD_REGISTER_KERNEL(randint, CPU, ALL_LAYOUT, phi::RandintKernel, int, int64_t) { +} diff --git a/paddle/phi/kernels/cpu/randperm_kernel.cc b/paddle/phi/kernels/cpu/randperm_kernel.cc index 28092c8df6d153c6b5e787027f0c2239bd257cc1..6cb435f53b85bd22afba1a0d31b16ecd4c27204b 100644 --- a/paddle/phi/kernels/cpu/randperm_kernel.cc +++ b/paddle/phi/kernels/cpu/randperm_kernel.cc @@ -13,20 +13,23 @@ // limitations under the License. #include "paddle/phi/kernels/randperm_kernel.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" + #include "paddle/phi/core/kernel_registry.h" namespace phi { template -void RandpermKernel(const Context& ctx, - int n, - DataType dtype, - DenseTensor* out) { - T* out_data = ctx.template Alloc(out); - auto gen_ptr = ctx.GetHostGenerator(); - auto engine = gen_ptr->GetCPUEngine(); +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } for (int i = 0; i < n; ++i) { out_data[i] = static_cast(i); @@ -34,8 +37,25 @@ void RandpermKernel(const Context& ctx, std::shuffle(out_data, out_data + n, *engine); } +template +void RandpermKernel(const Context& dev_ctx, + int n, + DataType dtype, + DenseTensor* out) { + RandpermRawKernel(dev_ctx, n, dtype, 0, out); +} + } // namespace phi +PD_REGISTER_KERNEL(randperm_raw, + CPU, + ALL_LAYOUT, + phi::RandpermRawKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(randperm, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff34ef26f6bd3aea13815cb347719f054fd0a058 --- /dev/null +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/size_kernel.h" +#include "paddle/phi/kernels/impl/size_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(size, + CPU, + ALL_LAYOUT, + phi::SizeKernel, + int, + int64_t, + phi::dtype::float16, + float, + double, + bool) {} diff --git a/paddle/phi/kernels/cpu/softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef90f9c6762d680a00e8841a20ff4ddcd5abe28a --- /dev/null +++ b/paddle/phi/kernels/cpu/softmax_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + softmax_grad, CPU, ALL_LAYOUT, phi::SoftmaxGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..537b4326681a175fbad7593eed1d8b6caee9d86c --- /dev/null +++ b/paddle/phi/kernels/cpu/softmax_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" + +PD_REGISTER_KERNEL( + softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ebc032ef54538188d8e287673c0d31fae9ad197b --- /dev/null +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/generator.h" + +namespace phi { + +template +void TruncatedGaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + T* data = dev_ctx.template Alloc(tensor); + + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); + TruncatedNormal truncated_normal(mean, std); + int64_t size = tensor->numel(); + + auto engine = paddle::framework::GetCPURandomEngine(seed); + for (int64_t i = 0; i < size; ++i) { + data[i] = truncated_normal(dist(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(truncated_gaussian_random, + CPU, + ALL_LAYOUT, + phi::TruncatedGaussianRandomKernel, + float) {} diff --git a/paddle/phi/kernels/cpu/unbind_kernel.cc b/paddle/phi/kernels/cpu/unbind_kernel.cc index 655f8c8aafbf201dc07db0fa1af79605c2a76763..39cc2f8fc4662a0893fb8b73b138a52b810f59b8 100644 --- a/paddle/phi/kernels/cpu/unbind_kernel.cc +++ b/paddle/phi/kernels/cpu/unbind_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unbind_kernel.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unbind_kernel_impl.h" diff --git a/paddle/phi/kernels/diag_kernel.h b/paddle/phi/kernels/diag_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8dc919fa633606ce30657cb20a59fbf615e3e15a --- /dev/null +++ b/paddle/phi/kernels/diag_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
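The TruncatedGaussianRandomKernel above maps uniform samples through a TruncatedNormal functor (defined in the kernel header, not shown here), i.e. inverse-CDF sampling of a normal distribution truncated to a fixed range; the historical fluid operator truncates at two standard deviations, which is assumed in the sketch below. An equivalent (if less efficient) rejection-sampling version using only the standard library:

    #include <cstdio>
    #include <random>

    int main() {
      const float mean = 0.f, std_dev = 1.f;
      std::mt19937 engine(42);
      std::normal_distribution<float> normal(mean, std_dev);
      for (int i = 0; i < 5; ++i) {
        float v;
        do {
          v = normal(engine);  // resample until the value falls inside +/- 2 std devs
        } while (v < mean - 2 * std_dev || v > mean + 2 * std_dev);
        printf("%f\n", v);
      }
      return 0;
    }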
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h index f233ba2a956276ad22819d49c30fbcbaf8a671c3..7cf7282307a4b91a771441d3218121b606afdf81 100644 --- a/paddle/phi/kernels/diagonal_kernel.h +++ b/paddle/phi/kernels/diagonal_kernel.h @@ -25,4 +25,4 @@ void DiagonalKernel(const Context& dev_ctx, int axis1, int axis2, DenseTensor* out); -} // pten +} // phi diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h index b95d98895aa8edda497a730281603028b98bc4f0..38912a5ccc442b6ea5fb484b708754dd706ae706 100644 --- a/paddle/phi/kernels/digamma_grad_kernel.h +++ b/paddle/phi/kernels/digamma_grad_kernel.h @@ -24,4 +24,4 @@ void DigammaGradKernel(const Context& ctx, const DenseTensor& x, DenseTensor* x_grad); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index 1772a33e4ee4cd88d80705971462b632c1015c3c..ce25f2e148e963054fcfa2a51321954b45a4297b 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -21,4 +21,4 @@ namespace phi { template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8109d3879cb21edd85d19612a62d9a8e0711e456..6e5f15fe1692b473965f96f68fd86fad87f1892e 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -25,7 +25,8 @@ void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); + dev_ctx.template Alloc(out); } template @@ -68,7 +69,9 @@ PD_REGISTER_KERNEL(empty_like, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(empty, @@ -100,5 +103,7 @@ PD_REGISTER_KERNEL(empty_like, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #endif diff --git a/paddle/phi/kernels/empty_kernel.h b/paddle/phi/kernels/empty_kernel.h index 54ba8b16c1d7409915f11411e99abaac03586aec..0b8d95ee94fb5480684023ec6c71698ba06d9c13 100644 --- a/paddle/phi/kernels/empty_kernel.h +++ b/paddle/phi/kernels/empty_kernel.h @@ -54,22 +54,20 @@ DenseTensor Empty(const Context& dev_ctx) { } template -DenseTensor Empty(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype = DataType::FLOAT32) { +DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); EmptyKernel(dev_ctx, shape, dtype, &dense_out); return dense_out; } template -DenseTensor EmptyLike(const Context& dev_ctx, - const DenseTensor& x, - DataType dtype = DataType::UNDEFINED) { +DenseTensor EmptyLike(const Context& dev_ctx, const DenseTensor& x) { auto dense_out = 
Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); EmptyLikeKernel(dev_ctx, x, dtype, &dense_out); return dense_out; diff --git a/paddle/phi/kernels/erfinv_grad_kernel.h b/paddle/phi/kernels/erfinv_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..67e70ad38caf4f74864500757b1f733188dbbc86 --- /dev/null +++ b/paddle/phi/kernels/erfinv_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ErfinvGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/erfinv_kernel.h b/paddle/phi/kernels/erfinv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8380a62971ba452ed86ad9d993690c8e42afdd53 --- /dev/null +++ b/paddle/phi/kernels/erfinv_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h index eb32ed24568599b2966f1f7772b8e9f6e710063b..fb5a0112ffcf7120314471db3c30b0e72a2b9c81 100644 --- a/paddle/phi/kernels/expand_kernel.h +++ b/paddle/phi/kernels/expand_kernel.h @@ -26,4 +26,4 @@ void ExpandKernel(const Context& ctx, const ScalarArray& shape, DenseTensor* out); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/eye_kernel.h b/paddle/phi/kernels/eye_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8b21b8ae40562c979b23e4292a7591d9c6f10cf7 --- /dev/null +++ b/paddle/phi/kernels/eye_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void EyeKernel(const Context& ctx, + int64_t num_rows, + int64_t num_columns, + int dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 7e8010a43f3d1898309ff72ab7189c58d4ece71d..f6ba2725004fe799f46ceebc26208f8adfda5047 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -26,8 +26,8 @@ void FlattenGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - phi::Copy(dev_ctx, out_grad, false, x_grad); - x_grad->ResizeAndAllocate(x_dims); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + x_grad->Resize(x_dims); } } // namespace phi diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 12eaab92d5211c08143ba72058cd4443aca1501c..78ac9eaa785cd20c6087586892a9503ca4e24040 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx, int stop_axis, DenseTensor* out) { auto out_dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); - out->ResizeAndAllocate(out_dims); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(out_dims); } // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index 394aab8f96e1ad1e8f2fb53ee4a163e7ec874226..c7b1f9af0e3191ec217d2907677ff34edebc551b 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -48,10 +48,10 @@ void FullLikeKernel(const Context& dev_ctx, template DenseTensor Full(const Context& dev_ctx, const ScalarArray& shape, - const Scalar& val, - DataType dtype = DataType::FLOAT32) { + const Scalar& val) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); FullKernel(dev_ctx, shape, val, dtype, &dense_out); return dense_out; @@ -60,10 +60,10 @@ DenseTensor Full(const Context& dev_ctx, template DenseTensor FullLike(const Context& dev_ctx, const DenseTensor& x, - const Scalar& val, - DataType dtype = DataType::UNDEFINED) { + const Scalar& val) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); FullLikeKernel(dev_ctx, x, val, dtype, &dense_out); return dense_out; diff --git a/paddle/phi/kernels/funcs/axis_utils.h b/paddle/phi/kernels/funcs/axis_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..02a89471889a7abdda0e9856bf8c8d006895910d --- /dev/null +++ b/paddle/phi/kernels/funcs/axis_utils.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { + +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} + +static inline int SizeToAxis(const int axis, DDim dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeFromAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeOutAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 4d7700a89d27bb66e741b1e38207d5bd3a797658..2868aa5acb75e37110f02cf30e761625a3cc8ff7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -76,6 +76,36 @@ struct CBlas { "Blas VCOPY do not supported on CPU with bfloat16," " please check your code")); } + + template + static void VADD(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } + } + + template + static void VMUL(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } + } + + template + static void VSUB(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } + } }; #ifdef PADDLE_WITH_MKLML diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc index c8405703a5c16ae9eae583638d1c89c22a736531..aa73ba5f689906e73f3f0e3a845aa397ad0a33c1 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cc +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -12,21 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/utils/data_type.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 2abfdb606e7e6c410f6f9deb45aed536bea88207..840c8872f50f83c2859f07be2e0e7242a74004a7 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -12,23 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.h b/paddle/phi/kernels/funcs/concat_and_split_functor.h index 3af4d878d3cab03eb80a6ba878cc4fa5a62103c9..4cb15fe539b66b8a6fddccf18d92b95976db2a65 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.h +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.h @@ -13,20 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include + #include -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/utils/data_type.h" +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index 32237e2cc236657db5a99fdd64392da4ff900562..70e3545b981fa7841f56a5a9ec2a9d4890b17d79 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -92,4 +92,4 @@ static inline phi::DDim ComputeAndCheckShape( } } // namespace funcs -} // namespace phi +} // namespace phi diff --git a/paddle/phi/kernels/funcs/diag_functor.h b/paddle/phi/kernels/funcs/diag_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..a806d1583a0b363d44aa9f0cf3b3a64f4a8ea6ff --- /dev/null +++ b/paddle/phi/kernels/funcs/diag_functor.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +inline int ComputeStride(int axis, phi::DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a82c4f66d010273f0f09fa71a38c3081fd1bc2ee..19a93970d090af060b888f512782975b073fff72 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -22,8 +22,8 @@ #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { namespace funcs { @@ -118,7 +118,7 @@ DenseTensor Diagonal(const DeviceContext& context, #endif // auto& dev_ctx = context.template device_context(); - paddle::platform::ForRange for_range(context, diag.numel()); + phi::funcs::ForRange for_range(context, diag.numel()); DiagonalFunctor functor( input_data, diag_arr, ret_arr, pos, dim_size, diag_data); for_range(functor); diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cu b/paddle/phi/kernels/funcs/eigen/elementwise.cu index 96d2ddba03c28df95331832ba2d4aa8e352d2f2b..3855ba8ccf94562f5c1b8ea2ca0e471fdb3f943d 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cu +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cu @@ -55,5 +55,5 @@ struct EigenSub { template struct EigenSub; -} // namespace fucns +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 47f1593a11eb9e29cc90b7db36650826734ac27f..d369781f845eb0887817f83be761b1027fc0bab0 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -14,11 +14,11 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -418,7 +418,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, DX_OP dx_op, DY_OP dy_op) { size_t N = static_cast(phi::product(x_dim)); - paddle::platform::ForRange for_range(dev_ctx, N); + phi::funcs::ForRange for_range(dev_ctx, N); for_range(ElemwiseGradNoBroadcast{ x.data(), y.data(), diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h new file mode 100644 index 0000000000000000000000000000000000000000..bf0888c301fe739994089b8e05357bd810455756 --- /dev/null +++ b/paddle/phi/kernels/funcs/for_range.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +namespace phi { +namespace funcs { + +template +struct ForRange { + ForRange(const Context& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +// NOTE: After the pten kernel is migrated, it needs to be deleted. +template <> +struct ForRange { + ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(limit) {} + + template + void operator()(Function func) const { + phi::funcs::ForRange for_range(dev_ctx_, limit_); + for_range(func); + } + + const paddle::platform::CPUDeviceContext& dev_ctx_; + size_t limit_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const phi::GPUContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(limit) {} + + template + inline void operator()(Function func) const { +#ifdef __HIPCC__ + // HIP will throw core dump when threads > 256 + constexpr int num_threads = 256; +#elif WITH_NV_JETSON + // JETSON_NANO will throw core dump when threads > 128 + int num_thread = 256; + backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128); + const int num_threads = num_thread; +#else + constexpr int num_threads = 1024; +#endif + size_t block_size = limit_ <= num_threads ? 
limit_ : num_threads; + size_t grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const phi::GPUContext& dev_ctx_; + size_t limit_; +}; + +// NOTE: After the pten kernel is migrated, it needs to be deleted. +template <> +struct ForRange { + ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(limit) {} + + template + inline void operator()(Function func) const { + phi::funcs::ForRange for_range(dev_ctx_, limit_); + for_range(func); + } + + const paddle::platform::CUDADeviceContext& dev_ctx_; + size_t limit_; +}; + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h index 5657bb047d7aa3a9b0f65d845d03e04c5b3636ae..d518a877b26f2c3d295eb0ceda8d4b862006e633 100644 --- a/paddle/phi/kernels/funcs/functors.h +++ b/paddle/phi/kernels/funcs/functors.h @@ -38,12 +38,15 @@ struct AddGradFunctor { template struct ScaleFunctor { - explicit ScaleFunctor(const T coeff) : coeff_(coeff) {} + using MT = typename paddle::operators::details::MPTypeTrait::Type; + explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {} - inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; } + inline HOSTDEVICE T operator()(T ele) { + return static_cast(static_cast(ele) * coeff_); + } private: - T coeff_; + MT coeff_; }; template diff --git a/paddle/fluid/operators/multinomial_op.h b/paddle/phi/kernels/funcs/multinomial_functor.h similarity index 58% rename from paddle/fluid/operators/multinomial_op.h rename to paddle/phi/kernels/funcs/multinomial_functor.h index 077e0e0ffa57e39af5bb0420357ccf9a1298f473..05a5a0faf6774650facc082d9a04a46866e61db5 100644 --- a/paddle/fluid/operators/multinomial_op.h +++ b/paddle/phi/kernels/funcs/multinomial_functor.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,30 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -/** - * Samples a multinomial distribution given a probability input - */ - -template -void MultinomialFunctor(int64_t* out_data, const T* in_data, - const int64_t num_samples, const bool replacement, + +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +template +void MultinomialFunctor(const Context& dev_ctx, + int64_t* out_data, + const T* in_data, + const int64_t num_samples, + const bool replacement, const int64_t num_categories, const int64_t num_distributions) { std::vector cumulative_probs(num_categories); std::uniform_real_distribution dist(0, 1); - auto gen_ptr = framework::DefaultCPUGenerator(); - auto engine = gen_ptr->GetCPUEngine(); + auto engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); for (int64_t i = 0; i < num_distributions; i++) { T probs_sum = 0; @@ -44,11 +39,12 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, int64_t num_zeros = 0; for (int64_t j = 0; j < num_categories; j++) { prob_value = in_data[i * num_categories + j]; - PADDLE_ENFORCE_GE(prob_value, 0.0, - platform::errors::InvalidArgument( - "The input of multinomial distribution " - "should be >= 0, but got %f.", - prob_value)); + PADDLE_ENFORCE_GE( + prob_value, + 0.0, + errors::InvalidArgument("The input of multinomial distribution " + "should be >= 0, but got %f.", + prob_value)); probs_sum += prob_value; if (prob_value == 0) { @@ -56,17 +52,18 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, } cumulative_probs[j] = probs_sum; } - PADDLE_ENFORCE_GT(probs_sum, 0.0, - platform::errors::InvalidArgument( - "The sum of one multinomial distribution " - "probability should be > 0, but got %f.", - probs_sum)); + PADDLE_ENFORCE_GT( + probs_sum, + 0.0, + errors::InvalidArgument("The sum of one multinomial distribution " + "probability should be > 0, but got %f.", + probs_sum)); PADDLE_ENFORCE_EQ( - (replacement || (num_categories - num_zeros >= num_samples)), true, - platform::errors::InvalidArgument( - "When replacement is False, number of " - "samples should be less than non-zero " - "categories.")); + (replacement || (num_categories - num_zeros >= num_samples)), + true, + errors::InvalidArgument("When replacement is False, number of " + "samples should be less than non-zero " + "categories.")); for (int64_t j = 0; j < num_categories; j++) { cumulative_probs[j] /= probs_sum; @@ -121,8 +118,5 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, } } -template -class MultinomialOpKernel; - -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..65978da1374e4888afe8a7b408b0bb5a70d92b66 --- /dev/null +++ b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + addmm_grad, GPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/addmm_kernel.cu b/paddle/phi/kernels/gpu/addmm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b589ce20acca5c6cf51fd16ea223ef6b0d17466 --- /dev/null +++ b/paddle/phi/kernels/gpu/addmm_kernel.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_kernel_impl.h" + +PD_REGISTER_KERNEL(addmm, GPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu b/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f4f69ee83eea14d31d67b600480ff5b1e5bad338 --- /dev/null +++ b/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
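+
+// As with the addmm registration files above, this translation unit is
+// intentionally thin: it includes the public kernel declaration and the shared
+// *_impl.h header, and then registers the GPU instantiations for the supported
+// dtypes via PD_REGISTER_KERNEL.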
+ +#include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product_grad, + GPU, + ALL_LAYOUT, + phi::BilinearTensorProductGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu b/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b81b842cedba232d25059876e06dd47479d513d6 --- /dev/null +++ b/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product, + GPU, + ALL_LAYOUT, + phi::BilinearTensorProductKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu b/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9165e8ea4147ff02bfe8a84d8fc24e92a6826025 --- /dev/null +++ b/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cholesky_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + cholesky_grad, GPU, ALL_LAYOUT, phi::CholeskyGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..22ea87d83e8db924b81d75d81820ec5e4c3ed782 --- /dev/null +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -0,0 +1,217 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/cholesky_kernel.h" + +#include +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +struct MatrixBandPartFunctor { + /*! Set output as input value outside a central band and 0 inside that band. + * That is: output[i, j, ..., m, n] = in_band(m, n) * input[i, j, ..., m, n] + * where: in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && (num_upper + * < 0 || (n-m) <= num_upper) + */ + MatrixBandPartFunctor(const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const T* input, + T* output) + : m_(m), + n_(n), + num_lower_diags_(num_lower_diags), + num_upper_diags_(num_upper_diags), + input_(input), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int col = index % n_; + const int row = (index / n_) % m_; + const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); + const int band_end = + (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); + if (col < band_start || col >= band_end) { + output_[index] = static_cast(0); + } else { + output_[index] = input_[index]; + } + } + + const int m_, n_, num_lower_diags_, num_upper_diags_; + const T* input_; + T* output_; +}; + +#define FUNC_WITH_TYPES(m) m(float, S) m(double, D) + +#define POTRF_INSTANCE(T, C) \ + void Potrf(const GPUContext& dev_ctx, \ + cublasFillMode_t uplo, \ + int n, \ + T* A, \ + int lda, \ + int* info) { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + int workspace_size = 0; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ + handle, uplo, n, A, lda, &workspace_size)); \ + auto workspace = paddle::memory::Alloc(dev_ctx, workspace_size); \ + T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf( \ + handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ + } + +FUNC_WITH_TYPES(POTRF_INSTANCE); + +#if CUDA_VERSION >= 9020 && !defined(_WIN32) +#define POTRF_BATCH_INSTANCE(T, C) \ + void PotrfBatched(const GPUContext& dev_ctx, \ + cublasFillMode_t uplo, \ + int n, \ + T* Aarray[], \ + int lda, \ + int* info_array, \ + int batch_size) { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrfBatched( \ + handle, uplo, n, Aarray, lda, info_array, batch_size)); \ + } + +FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); +#endif + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out) { + auto& dims = x.dims(); + int batch_count = 1; + for (int i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + int m = dims[dims.size() - 1]; + int tensor_size = batch_count * m * m; + + const auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + + // matrices are assumed to be stored in column-major order in cusolver + 
cublasFillMode_t uplo = + upper ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + // potrf works in-place, so first copy the triangular part of the input matrices to + // the output and set the other triangular part to 0 + paddle::platform::ForRange for_range(dev_ctx, tensor_size); + if (upper) { + MatrixBandPartFunctor<T> matrix_band_part_functor(m, + m, + /* num_lower_diags */ 0, + /* num_upper_diags */ m, + x_data, + out_data); + for_range(matrix_band_part_functor); + } else { + MatrixBandPartFunctor<T> matrix_band_part_functor(m, + m, + /* num_lower_diags */ m, + /* num_upper_diags */ 0, + x_data, + out_data); + for_range(matrix_band_part_functor); + } + + auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_count); + auto* info_ptr = reinterpret_cast<int*>(info->ptr()); + +#if CUDA_VERSION >= 9020 && !defined(_WIN32) + if (batch_count > 1) { + std::vector<T*> output_ptrs; + for (int i = 0; i < batch_count; i++) { + output_ptrs.emplace_back(out_data + i * m * m); + } + thrust::device_vector<T*> dev_output_ptrs(output_ptrs.begin(), + output_ptrs.end()); + PotrfBatched(dev_ctx, + uplo, + m, + thrust::raw_pointer_cast(dev_output_ptrs.data()), + m, + info_ptr, + batch_count); + // TODO(guosheng): There seems to be a bug in cusolver potrfBatched, so the + // upper triangle of the output needs to be cleared. Remove this workaround + // once the bug is fixed. + if (!upper) { + MatrixBandPartFunctor<T> matrix_band_part_functor(m, + m, + /* num_lower_diags */ m, + /* num_upper_diags */ 0, + out_data, + out_data); + for_range(matrix_band_part_functor); + } + } else { +#endif + for (int i = 0; i < batch_count; i++) { + Potrf(dev_ctx, uplo, m, out_data + i * m * m, m, info_ptr + i); + } + +#if CUDA_VERSION >= 9020 && !defined(_WIN32) + } +#endif + // check the info + std::vector<int> error_info; // only for checking that the matrix is positive definite + error_info.resize(batch_count); + + paddle::memory::Copy(CPUPlace(), + error_info.data(), + dev_ctx.GetPlace(), + info_ptr, + sizeof(int) * batch_count, + dev_ctx.stream()); + + for (int i = 0; i < batch_count; ++i) { + PADDLE_ENFORCE_EQ(error_info[i], + 0, + errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U.", + i, + error_info[i], + error_info[i])); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cholesky, // cuda_only + GPU, + ALL_LAYOUT, + phi::CholeskyKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index 0cbf5525d60f53aa47ce58bc217e8ce75b399c14..4545f9ce436ea4028d43d3a91ae46a21cde41bb5 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -28,11 +28,11 @@ namespace phi { template <typename Context> void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); const auto& src_place = src.place(); - auto dst_place = dst->place(); if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) { PADDLE_THROW(phi::errors::InvalidArgument( @@ -43,8 +43,14 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->ResizeAndAllocate(src.dims()); - auto* dst_ptr = dst->mutable_data(dst_place); + dst->Resize(src.dims()); + + void* dst_ptr = nullptr; + if (paddle::platform::is_cpu_place(dst_place)) { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy
the same data async from " << src_place << " to " @@ -57,17 +63,8 @@ void Copy(const Context& dev_ctx, auto size = src.numel() * paddle::experimental::SizeOf(src.dtype()); - if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { + if (paddle::platform::is_gpu_place(src_place) && // NOLINT + paddle::platform::is_cpu_place(dst_place)) { auto src_gpu_place = src_place; auto dst_cpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); @@ -114,56 +111,6 @@ void Copy(const Context& dev_ctx, : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); - } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = src_place; - auto dst_cuda_pinned_place = dst_place; - auto ctx_place = dev_ctx.GetPlace(); - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), - true, - phi::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from GPU memory to CUDA Pinned memory, current " - "device context place should be GPU.")); - auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(src_gpu_place, - ctx_gpu_place, - phi::errors::PreconditionNotMet( - "The source GPU device and current device context do " - "not match. The source GPU device number is %d, but " - "device context GPU number is %d.", - src_gpu_place.device, - ctx_gpu_place.device)); - auto stream = - blocking ? nullptr - : reinterpret_cast(dev_ctx).stream(); - paddle::memory::Copy( - dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = src_place; - auto dst_gpu_place = dst_place; - auto ctx_place = dev_ctx.GetPlace(); - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), - true, - phi::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from CUDA Pinned memory to GPU memory, current " - "device context place should be GPU.")); - auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_gpu_place, - ctx_gpu_place, - phi::errors::PreconditionNotMet( - "The target GPU device and current device context do " - "not match. The target GPU device number is %d, but " - "device context GPU number is %d.", - dst_gpu_place.device, - ctx_gpu_place.device)); - auto stream = - blocking ? 
nullptr - : reinterpret_cast(dev_ctx).stream(); - paddle::memory::Copy( - dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { auto src_gpu_place = src_place; diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..fc70639787173d84b69262245dbb0500aa179a90 --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_kernel.cu @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +// Extract the diagonal of a matrix 'x' to a vector 'out'. +template +__global__ void ExtractDiagonalKernel(T* out, + const T* x, + std::ptrdiff_t start, + std::ptrdiff_t size, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t outStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t xOffset = start + sumStride * idx; + out[outStride * idx] = x[xOffset]; + } +} + +// Paste a vector 'x' to the diagonal of a matrix 'out' +template +__global__ void PasteDiagonalKernel(T* out, + const T* x, + std::ptrdiff_t start, + std::ptrdiff_t x_length, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t xStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < x_length; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t outOffset = start + sumStride * idx; + out[outOffset] = x[xStride * idx]; + } +} + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out) { + auto* x_data = x.data(); + auto x_dims = x.dims(); + T* out_data = dev_ctx.template Alloc(out); + auto out_dims = out->dims(); + + auto GetBlockGridSize = [&dev_ctx](int64_t size) { + const int64_t block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{block_size, grid_size}; + }; + + if (x_dims.size() == 1) { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + auto size = (offset > 0) ? 
x_length + offset : x_length - offset; + const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + if (size > 0) { + const auto& out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + const auto& out_stride_1 = phi::funcs::ComputeStride(1, out_dims); + auto start = + (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + std::tuple block_grid_size = GetBlockGridSize(size); + + PasteDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>(out_data, + x_data, + start, + x_length, + out_stride_0 + out_stride_1, + x_stride); + } + } else { + const int& x_stride_0 = phi::funcs::ComputeStride(0, x_dims); + const int& x_stride_1 = phi::funcs::ComputeStride(1, x_dims); + + int64_t size; + if (offset > 0) { + size = std::min(x_dims[0], x_dims[1] - offset); + } else { + size = std::min(x_dims[0] + offset, x_dims[1]); + } + + if (size > 0) { + auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); + const auto& out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + + std::tuple block_grid_size = GetBlockGridSize(size); + + ExtractDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>( + out_data, x_data, start, size, x_stride_0 + x_stride_1, out_stride_0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + diag, GPU, ALL_LAYOUT, phi::DiagKernel, int, int64_t, float, double) {} diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/gpu/elementwise.h index 369bd8d8ad41832562158c03095ac601d5a822ce..12cafc7023bb5100d5f619aeec29a357a13e4935 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/gpu/elementwise.h @@ -1460,7 +1460,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, auto *dx_data = dx->mutable_data(ctx.GetPlace()); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } } else { // For inplace strategy, dx will be stored in addr of dout, which makes @@ -1481,7 +1481,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, auto *dy_data = dy->mutable_data(ctx.GetPlace()); if (dy->dims() == dout.dims()) { if (dy_data != dout_data) { - phi::Copy(ctx, dout, false, dy); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); } } else { std::vector reduce_dims = @@ -1507,11 +1507,11 @@ void elementwise_add_grad(const GPUContext &ctx, if (dx_data == dout_data && dy_data != dout_data) { VLOG(4) << "Special case when dx_data is the same as dout_data, " "only need copy dout to dy"; - phi::Copy(ctx, dout, false, dy); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); } else if (dx_data != dout_data && dy_data == dout_data) { VLOG(4) << "Special case when dy_data is the same as dout_data, " "only need copy dout to dx"; - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } else if (dx_data != dout_data && dy_data != dout_data) { auto size = x.numel(); int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); @@ -1571,7 +1571,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, auto *dx_data = dx->mutable_data(ctx.GetPlace()); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } } else { // For inplace strategy, dx will be stored in addr of dout, which makes diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 
02dbb506c4eb579fbb2b82513421aaf1dd3ef163..3c4c01b1dc8ff739ac87ca2e9fe7a6659ab4eac3 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -128,6 +128,7 @@ PD_REGISTER_KERNEL(add_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -140,6 +141,7 @@ PD_REGISTER_KERNEL(add_double_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -152,6 +154,7 @@ PD_REGISTER_KERNEL(add_triple_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -164,6 +167,7 @@ PD_REGISTER_KERNEL(subtract_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -176,5 +180,6 @@ PD_REGISTER_KERNEL(subtract_double_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..50fbfddf0432e38068dd1fe529a6b26f4fdd788b --- /dev/null +++ b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/erfinv_grad_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + erfinv_grad, GPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/erfinv_kernel.cu b/paddle/phi/kernels/gpu/erfinv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..10df0bdf5603c927dba7631e07096ac9cf2aeb50 --- /dev/null +++ b/paddle/phi/kernels/gpu/erfinv_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/erfinv_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(erfinv, GPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/fluid/operators/erfinv_op.cu b/paddle/phi/kernels/gpu/eye_kernel.cu similarity index 52% rename from paddle/fluid/operators/erfinv_op.cu rename to paddle/phi/kernels/gpu/eye_kernel.cu index 1fb2dbb97a2df60783bb84b88816823ec1afd9a2..069310b0d156271079ab76ddfb4d8ff88400be78 100644 --- a/paddle/fluid/operators/erfinv_op.cu +++ b/paddle/phi/kernels/gpu/eye_kernel.cu @@ -12,17 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/erfinv_op.h" +#include "paddle/phi/kernels/eye_kernel.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" -REGISTER_OP_CUDA_KERNEL( - erfinv, - paddle::operators::ErfinvKernel, - paddle::operators::ErfinvKernel); +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" -REGISTER_OP_CUDA_KERNEL( - erfinv_grad, - paddle::operators::ErfinvGradKernel, - paddle::operators::ErfinvGradKernel); +PD_REGISTER_KERNEL(eye, + GPU, + ALL_LAYOUT, + phi::EyeKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 48b26540331ef54c971d3f0cf5ee9ef9fc185f1d..1f756bfdbed30a18aebfabfb0810436406c87204 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); int numel = out->numel(); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (numel > 0) { // in transformer model the numel of outpout will be zero. std::vector inputs = {}; @@ -85,7 +85,7 @@ void FullLikeKernel(const Context& dev_ctx, static_cast(value))); std::vector inputs = {}; std::vector outputs = {out}; - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); // This function has no input, so the inputs.size() == 0. Use kUnary, but the // data will not be loaded in the kernel because the number of parameters in // the operator is 0 diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a28a7512f49862d21c83029bb05ff13bacdb995e --- /dev/null +++ b/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(gumbel_softmax_grad, + GPU, + ALL_LAYOUT, + phi::GumbelSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6b1e58981baa0a4768057b5a1c072d4182dfc1fd --- /dev/null +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gumbel_softmax_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +using KeyValuePair = cub::KeyValuePair; + +template +struct UniformCUDAGenerator { + T min_, max_; + unsigned int seed_; + unsigned int offset_ = 0; + HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed) + : min_(min), max_(max), seed_(seed) {} + HOSTDEVICE UniformCUDAGenerator(T min, + T max, + unsigned int seed, + unsigned int offset) + : min_(min), max_(max), seed_(seed), offset_(offset) {} + + HOSTDEVICE T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + return dist(rng); + } +}; + +template +__global__ void OneHotCUDAKernel(const int64_t height, + const int64_t width, + const int64_t size_out_axis, + const T init, + const T* in, + T* out) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int64_t idx = blockIdx.x; idx < height; idx += gridDim.x) { + KeyValuePair kv_pair = {-1, init}; + int h = idx / size_out_axis; + int w = idx % size_out_axis; + cub::ArgMax reducer; + for (int k = threadIdx.x; k < width; k += blockDim.x) { + kv_pair = reducer( + {k, in[h * width * size_out_axis + k * size_out_axis + w]}, kv_pair); + } + kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); + if (threadIdx.x == 0) { + int index = static_cast(kv_pair.key); + out[h * width * size_out_axis + index * size_out_axis + w] = 1; + } + __syncthreads(); + } +} + +template +struct OneHotGenerator { + static void Transform(const GPUContext& ctx, + const DenseTensor& X, + DenseTensor* out, + int axis) { + const int size_to_axis = funcs::SizeToAxis(axis, X.dims()); + const int size_from_axis = 
funcs::SizeFromAxis(axis, X.dims()); + const int size_out_axis = funcs::SizeOutAxis(axis, X.dims()); + constexpr int thread_size = 512; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t height = size_to_axis * size_out_axis; + int block_size = height < max_grid_dimx ? height : max_grid_dimx; + + DenseTensor input_tensor; + input_tensor.Resize(out->dims()); + ctx.template Alloc(&input_tensor); + paddle::framework::TensorCopy(*out, ctx.GetPlace(), &input_tensor); + funcs::set_constant(ctx, out, 0.0); + OneHotCUDAKernel<<>>( + height, + size_from_axis / size_out_axis, + size_out_axis, + std::numeric_limits::lowest(), + input_tensor.data(), + out->data()); + } +}; + +template +__global__ void AddGumbelNoiseCUDAKernel(const T* input_data, + T* output_data, + T* noise, + const float temperature, + int64_t n) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int step = blockDim.x * gridDim.x; + for (int64_t i = index; i < n; i += step) { + T gumbel_noise = -log(-log(noise[i])); + output_data[i] = (gumbel_noise + input_data[i]) / temperature; + } +} + +template +struct GumbleNoiseGenerator { + static void Transform(const GPUContext& ctx, + const T* input_data, + T* output_data, + int size_to_axis, + int size_from_axis, + const float temperature) { + DenseTensor random_tensor; + int64_t size = size_to_axis * size_from_axis; + random_tensor.Resize(make_ddim({size})); + auto* random_data = ctx.template Alloc(&random_tensor); + thrust::counting_iterator index_sequence_begin(0); + + // generate gumbel noise + int device_id = ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy()) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); + } else { + const unsigned int seed = std::random_device()(); + thrust::transform(index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed)); + } + + // add gumbel noise to X + const int thread_size = 512; + int64_t block_size = (size + thread_size) / thread_size; + AddGumbelNoiseCUDAKernel<<>>( + input_data, output_data, random_data, temperature, size); + } +}; + +} // namespace phi +#endif + +PD_REGISTER_KERNEL( + gumbel_softmax, GPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/increment_kernel.cu b/paddle/phi/kernels/gpu/increment_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3c31271911489c94e895dbea786e4bf61f56bb4 --- /dev/null +++ b/paddle/phi/kernels/gpu/increment_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/increment_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/increment_kernel_impl.h" + +PD_REGISTER_KERNEL(increment, + GPU, + ALL_LAYOUT, + phi::IncrementKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 02e3f00bd3425b6dd6f3fe02a4eabf59aaca99ea..56e8b16ccbe0df16fdc96470a8167e6dc6abfb3c 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -106,6 +106,7 @@ PD_REGISTER_KERNEL(add_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(subtract_raw, @@ -118,6 +119,7 @@ PD_REGISTER_KERNEL(subtract_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(divide_raw, @@ -143,7 +145,8 @@ PD_REGISTER_KERNEL(multiply_raw, bool, float16, complex64, - complex128) {} + complex128, + bfloat16) {} PD_REGISTER_KERNEL(sum_raw, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4918495ff7bed83d8fee7e811017927b53faf5f9 --- /dev/null +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -0,0 +1,290 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_WITH_HIP +// To-do(qili93): fix this after issue resolved +// https://github.com/ROCmSoftwarePlatform/rocPRIM/issues/202 + +#include "paddle/phi/kernels/multinomial_kernel.h" + +#include +#include +#include +#include + +#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/multinomial_functor.h" + +namespace phi { + +template +__global__ void NormalizeProbability(T* norm_probs, + const T* in_data, + T* sum_rows, + int64_t num_distributions, + int64_t num_categories) { + int id = threadIdx.x + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * blockDim.x; + if (id < num_distributions * num_categories) { + PADDLE_ENFORCE( + in_data[id] >= 0.0, + "The input of multinomial distribution should be >= 0, but got %f.", + in_data[id]); + int64_t row_id = id / num_categories; + PADDLE_ENFORCE(sum_rows[row_id] > 0.0, + "The sum of one multinomial distribution probability should " + "be > 0, but got %f.", + sum_rows[row_id]); + norm_probs[id] = in_data[id] / sum_rows[row_id]; + } +} + +template +__global__ void GetCumulativeProbs(T* norm_probs_data, + int64_t num_distributions, + int64_t num_categories, + T* cumulative_probs) { + int id = blockIdx.x; + thrust::inclusive_scan(thrust::device, + norm_probs_data + id * num_categories, + norm_probs_data + (id + 1) * num_categories, + cumulative_probs + id * num_categories); +} + +template +struct RandomGeneratorCudaFunctor { + unsigned int seed_; + __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(0.0, 1.0); + rng.discard(n); + return dist(rng); + } +}; + +template +__device__ int binarySearchFunctor(T* cumulative_probs, + T* norm_probs_data, + int num_categories, + T rng_number) { + int left = 0; + int right = num_categories; + + while (right - left > 0) { + int mid = left + (right - left) / 2; + + T temp_prob = cumulative_probs[mid]; + if (temp_prob < rng_number) { + left = mid + 1; + } else { + right = mid; + } + } + + if (left == num_categories) { + left = num_categories - 1; + } + + while (left >= 1 && norm_probs_data[left] == 0) left--; + + return left; +} + +template +__global__ void sampleMultinomialWithReplacement( + T* rng_data, + const int64_t num_samples, + int64_t* out_data, + const int64_t num_distributions, + const int64_t num_categories, + T* cumulative_probs, + T* norm_probs_data) { + // use binary search to get the selected category sample id. + // let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. 
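+ // In other words, this is inverse-CDF sampling: cumulative_probs holds the
+ // normalized prefix sums of each distribution, and the search returns the
+ // first index whose cumulative probability reaches rng_number, so category
+ // k is drawn with probability norm_probs_data[k].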
+ + // for every distribution + int dist = blockIdx.y; + // for every sample + int sample = blockIdx.x * blockDim.x + threadIdx.x; + if (sample < num_samples) { + T rng_number = rng_data[sample + dist * num_samples]; + + // Find the bucket that a uniform random number lies in + int selected_category = + binarySearchFunctor(cumulative_probs + dist * num_categories, + norm_probs_data + dist * num_categories, + num_categories, + rng_number); + + out_data[sample + dist * num_samples] = selected_category; + } +} + +template <typename T, typename Context> +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out) { + auto* in_data = x.data<T>(); + int64_t* out_data = dev_ctx.template Alloc<int64_t>(out); + + auto in_dims = x.dims(); + int64_t in_rank = in_dims.size(); + const int64_t num_categories = in_dims[in_rank - 1]; + const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; + + // If replacement is False, sampling is done without replacement: every + // category can be drawn at most once, so the distribution changes after each + // draw and the sampling cannot be parallelized. Thus, call the CPU + // implementation ``funcs::MultinomialFunctor`` to sample the distribution. + if (!replacement) { + int64_t in_data_numel = x.numel(); + int64_t out_data_numel = out->numel(); + + T* cpu_in_data = new T[in_data_numel]; + int64_t* cpu_out_data = new int64_t[out_data_numel]; + +#ifdef PADDLE_WITH_HIP + hipMemcpy( + cpu_in_data, in_data, in_data_numel * sizeof(T), hipMemcpyDeviceToHost); +#else + cudaMemcpy(cpu_in_data, + in_data, + in_data_numel * sizeof(T), + cudaMemcpyDeviceToHost); +#endif + + funcs::MultinomialFunctor(dev_ctx, + cpu_out_data, + cpu_in_data, + num_samples, + replacement, + num_categories, + num_distributions); + +#ifdef PADDLE_WITH_HIP + hipMemcpy(out_data, + cpu_out_data, + out_data_numel * sizeof(int64_t), + hipMemcpyHostToDevice); +#else + cudaMemcpy(out_data, + cpu_out_data, + out_data_numel * sizeof(int64_t), + cudaMemcpyHostToDevice); +#endif + + delete[] cpu_in_data; + delete[] cpu_out_data; + return; + } + + // The sum of the input may not be 1, so to get probabilities in the range + // [0, 1], compute the sum of each row of the input and use it to normalize + // the input. sum_rows_data: sum of each row + DenseTensor sum_rows_tensor; + sum_rows_tensor.Resize({num_distributions}); + auto* sum_rows_data = dev_ctx.template Alloc<T>(&sum_rows_tensor); + + auto& place = *dev_ctx.eigen_device(); + + if (num_distributions == 1) { + auto eigen_input = EigenVector<T>::Flatten(x); + auto eigen_sum_rows = EigenVector<T>::Flatten(sum_rows_tensor); + eigen_sum_rows.device(place) = + eigen_input.sum(Eigen::DSizes(1)) + .eval() + .reshape(Eigen::DSizes(sum_rows_tensor.dims()[0])); + } else { + auto eigen_input = EigenMatrix<T>::From(x); + auto eigen_sum_rows = EigenVector<T>::Flatten(sum_rows_tensor); + eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)); + } + + // Normalize each row of the input to get the probabilities in the range + // [0, 1]. norm_probs_data: probabilities of the distributions + DenseTensor norm_probs_tensor; + norm_probs_tensor.Resize({num_distributions, num_categories}); + auto* norm_probs_data = dev_ctx.template Alloc<T>(&norm_probs_tensor); + + // number of threads in a block is min(num_categories, 512) + dim3 block_norm(num_categories < 512 ?
num_categories : 512); + dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); + NormalizeProbability<<>>( + norm_probs_data, + in_data, + sum_rows_data, + num_distributions, + num_categories); + + // Get cumulative probability of each distribution. It's the same function + // of + // ``cumsum`` op. + DenseTensor cumulative_probs_tensor; + cumulative_probs_tensor.Resize({num_distributions, num_categories}); + auto* cumulative_probs = dev_ctx.template Alloc(&cumulative_probs_tensor); + + dim3 block_cumsum(1); + dim3 grid_cumsum(num_distributions); + GetCumulativeProbs<<>>( + norm_probs_data, num_distributions, num_categories, cumulative_probs); + + // Generate random number for each sample. + std::random_device rd; + auto seed = rd(); + + DenseTensor rng_data_tensor; + rng_data_tensor.Resize({num_distributions, num_samples}); + auto* rng_data = dev_ctx.template Alloc(&rng_data_tensor); + + thrust::counting_iterator index_sequence_begin(0); + paddle::platform::Transform trans; + trans(dev_ctx, + index_sequence_begin, + index_sequence_begin + num_distributions * num_samples, + rng_data, + RandomGeneratorCudaFunctor(seed)); + + // Sample the multinomial distributions. + dim3 block_sample(128); + dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); + sampleMultinomialWithReplacement< + T><<>>(rng_data, + num_samples, + out_data, + num_distributions, + num_categories, + cumulative_probs, + norm_probs_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multinomial, // cuda_only + GPU, + ALL_LAYOUT, + phi::MultinomialKernel, + float, + double) {} + +#endif diff --git a/paddle/phi/kernels/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/gpu/mv_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9eb8cd375ebd670907b556e7f2b8cf599d61643e --- /dev/null +++ b/paddle/phi/kernels/gpu/mv_grad_kernel.cu @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
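+
+// In the backward pass of out = x * vec with x of shape [m, n], the gradients
+// are dx[i][j] = dout[i] * vec[j] (an outer product, computed element-wise by
+// MVGradDxCUDAKernel below) and dvec = x^T * dout (computed with a single
+// GEMV call).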
+ +#include "paddle/phi/kernels/mv_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +__global__ void MVGradDxCUDAKernel( + const int m, const int n, const T *dout, const T *vec, T *dx) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < m * n; idx += blockDim.x * gridDim.x) { + int i = idx / n; + int j = idx % n; + dx[idx] = dout[i] * vec[j]; + } +} + +template +void MvGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &vec, + const DenseTensor &out_grad, + DenseTensor *x_grad, + DenseTensor *vec_grad) { + auto dout = out_grad; + auto dx = x_grad; + auto dvec = vec_grad; + + auto dim_x = x.dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + // get data ptr + const T *x_data = x.data(); + const T *vec_data = vec.data(); + const T *dout_data = dout.data(); + + auto blas = phi::funcs::GetBlas(dev_ctx); + auto stream = dev_ctx.stream(); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n); + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + + MVGradDxCUDAKernel< + T><<>>( + m, n, dout_data, vec_data, dx_data); + } + + if (dvec) { + T *dvec_data = dev_ctx.template Alloc(dvec); + + blas.GEMV(true, + dim_x[0], + dim_x[1], + static_cast(1), + x_data, + dout_data, + static_cast(0), + dvec_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mv_grad, GPU, ALL_LAYOUT, phi::MvGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/mv_kernel.cu b/paddle/phi/kernels/gpu/mv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1faba5a62d2cd60cca054d10c9571339375d0468 --- /dev/null +++ b/paddle/phi/kernels/gpu/mv_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/mv_kernel_impl.h" + +PD_REGISTER_KERNEL(mv, GPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6b82cbc67485b115eb7c4dda60375c8c0cdc3b04 --- /dev/null +++ b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" +#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_shuffle_grad, + GPU, + ALL_LAYOUT, + phi::PixelShuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..25b240c6c1a3bedbb2b93a3a2c32f3ba88afdb63 --- /dev/null +++ b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" +#include "paddle/phi/kernels/pixel_shuffle_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pixel_shuffle, GPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/poisson_kernel.cu b/paddle/phi/kernels/gpu/poisson_kernel.cu index ae97f2fca68cb37445086065ed421f160b481235..347f70b166657622840fbd3cfb4e62aa1f87eb2a 100644 --- a/paddle/phi/kernels/gpu/poisson_kernel.cu +++ b/paddle/phi/kernels/gpu/poisson_kernel.cu @@ -19,9 +19,9 @@ limitations under the License. */ #include #endif -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/poisson_kernel.h" namespace phi { @@ -65,7 +65,7 @@ void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - paddle::platform::ForRange for_range(ctx, size); + phi::funcs::ForRange for_range(ctx, size); PoissonCudaFunctor functor(x_data, out_data, seed, offset); for_range(functor); diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..66dc5f72a5c7067a08127bce65740851b123efd3 --- /dev/null +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/randint_kernel.h" + +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" + +namespace phi { + +template +void RandintRawKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out) { + DenseTensor tmp; + tmp.Resize(phi::make_ddim(shape.GetData())); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); + + out->Resize(tmp.dims()); + T* data = dev_ctx.template Alloc(out); + + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } + + std::uniform_int_distribution dist(low, high - 1); + auto numel = out->numel(); + for (int64_t i = 0; i < numel; ++i) { + tmp_data[i] = dist(*engine); + } + + paddle::memory::Copy( + out->place(), + data, + tmp.place(), + tmp_data, + numel * paddle::experimental::SizeOf(out->dtype()), + 0); +} + +template +void RandintKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out) { + RandintRawKernel(dev_ctx, low, high, shape, dtype, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + randint_raw, GPU, ALL_LAYOUT, phi::RandintRawKernel, int, int64_t) {} + +PD_REGISTER_KERNEL(randint, GPU, ALL_LAYOUT, phi::RandintKernel, int, int64_t) { +} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index f75f768b633a31a9d3d6eadcf036640f50309a8b..d4d90cac917a2c35e26eca0d57d1c5349b878599 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -12,41 +12,60 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/kernels/randperm_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/core/kernel_registry.h" namespace phi { template -void RandpermKernel(const Context& ctx, - int n, - DataType dtype, - DenseTensor* out) { +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { DenseTensor tmp; tmp.Resize(phi::make_ddim({n})); - T* tmp_data = ctx.template HostAlloc(&tmp); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - auto gen_ptr = ctx.GetHostGenerator(); - auto engine = gen_ptr->GetCPUEngine(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } for (int i = 0; i < n; ++i) { tmp_data[i] = static_cast(i); } std::shuffle(tmp_data, tmp_data + n, *engine); - T* out_data = ctx.template Alloc(out); + T* out_data = dev_ctx.template Alloc(out); auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); paddle::memory::Copy( out->place(), out_data, tmp.place(), tmp_data, size, 0); } +template +void RandpermKernel(const Context& dev_ctx, + int n, + DataType dtype, + DenseTensor* out) { + RandpermRawKernel(dev_ctx, n, dtype, 0, out); +} + } // namespace phi +PD_REGISTER_KERNEL(randperm_raw, + GPU, + ALL_LAYOUT, + phi::RandpermRawKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(randperm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..17a39944eb04f5cecd941b07e82fb6bb97363977 --- /dev/null +++ b/paddle/phi/kernels/gpu/size_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/size_kernel_impl.h" +#include "paddle/phi/kernels/size_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(size, + GPU, + ALL_LAYOUT, + phi::SizeKernel, + int, + int64_t, + phi::dtype::float16, + float, + double, + bool) {} diff --git a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..aa496d3cd391b59bef16c57dc8b7f0c39834c107 --- /dev/null +++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(softmax_grad, + GPU, + ALL_LAYOUT, + phi::SoftmaxGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..32efb9b776419efe5733ab0493c38f9c1a9c237e --- /dev/null +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" + +PD_REGISTER_KERNEL(softmax, + GPU, + ALL_LAYOUT, + phi::SoftmaxRawKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..12c1bf791e1691bb6eee81750b337adea713b794 --- /dev/null +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
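The softmax and softmax_grad GPU registrations above are thin wrappers around the shared impl headers; the forward computation they ultimately perform is the usual numerically stable softmax (subtract the row maximum before exponentiating, which is what the warp kernels further down do via ReduceMaxFunctor and ExpSubFunctor). A minimal single-row C++ sketch of that formula, for reference only — the function name softmax_row is illustrative, not part of the patch:

#include <algorithm>
#include <cmath>
#include <vector>

// Stable softmax over one non-empty row:
//   y_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
std::vector<float> softmax_row(const std::vector<float>& x) {
  const float x_max = *std::max_element(x.begin(), x.end());
  std::vector<float> y(x.size());
  float sum = 0.0f;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::exp(x[i] - x_max);  // max-subtraction keeps exp() from overflowing
    sum += y[i];
  }
  for (float& v : y) {
    v /= sum;
  }
  return y;
}

The float16 registrations rely on the same trick, except that the GPU kernels accumulate in float via MPTypeTrait, as the SwitchWarpSoftmaxForward code below shows.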
+ +#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h" + +#include +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/generator.h" +// #include "paddle/phi/core/generator.h" + +namespace phi { + +template +struct GPUTruncatedNormal { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + unsigned int seed; + T numeric_min; + + __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) + : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + } + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(numeric_min, 1); + rng.discard(n); + T value = dist(rng); + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + } +}; + +template +struct TruncatedNormalOffset { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + unsigned int seed; + T numeric_min; + int offset_; + + __host__ __device__ + TruncatedNormalOffset(T mean, T std, T numeric_min, int seed, int offset) + : mean(mean), + std(std), + seed(seed), + numeric_min(numeric_min), + offset_(offset) { + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + } + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(numeric_min, 1); + rng.discard(n + offset_); + T value = dist(rng); + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + } +}; + +template +void TruncatedGaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + T* data = dev_ctx.template Alloc(tensor); + + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + thrust::transform(index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormalOffset(mean, + std, + std::numeric_limits::min(), + seed_offset.first, + gen_offset)); + } else { + thrust::transform( + index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(data), + GPUTruncatedNormal(mean, std, std::numeric_limits::min(), seed)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(truncated_gaussian_random, + GPU, + ALL_LAYOUT, + phi::TruncatedGaussianRandomKernel, + float) {} diff --git a/paddle/phi/kernels/gpu/unbind_kernel.cu b/paddle/phi/kernels/gpu/unbind_kernel.cu index 1efc3a1094da253c27fec5108b536837d868425e..8a7aa8f6033ab9b86f87e792bc37f912562578a7 100644 --- a/paddle/phi/kernels/gpu/unbind_kernel.cu +++ b/paddle/phi/kernels/gpu/unbind_kernel.cu @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // 
limitations under the License. +#include "paddle/phi/kernels/unbind_kernel.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unbind_kernel_impl.h" -#include "paddle/phi/kernels/unbind_kernel.h" PD_REGISTER_KERNEL(unbind, GPU, diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h similarity index 63% rename from paddle/fluid/operators/softmax_cudnn_op.cu.h rename to paddle/phi/kernels/gpudnn/softmax_gpudnn.h index dc5166f4f994f8f4af1da2dcd1f1d26de1f35ba2..45798b88bb58a3b088b2545f4a343c18ebec0ec4 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -14,18 +14,20 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +// See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -namespace paddle { -namespace operators { +namespace phi { -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using DataLayout = platform::DataLayout; -using Tensor = framework::Tensor; +using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor; +using GPUDNNDataLayout = paddle::platform::DataLayout; // Vectorization trait 4 * sizeof(T) template @@ -41,7 +43,7 @@ class VecT4 { using Type = int4; }; template <> -class VecT4 { +class VecT4 { public: using Type = int2; }; @@ -60,7 +62,7 @@ class VecT2 { using Type = int2; }; template <> -class VecT2 { +class VecT2 { public: using Type = int; }; @@ -77,7 +79,8 @@ __device__ __forceinline__ void WarpReduceSum(T* sum) { for (int offset = WarpSize / 2; offset > 0; offset /= 2) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + T sum_val = + paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = sum[i] + sum_val; } } @@ -89,14 +92,13 @@ __device__ __forceinline__ void WarpReduceMax(T* sum) { for (int offset = WarpSize / 2; offset > 0; offset /= 2) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + T max_val = + paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = max(sum[i], max_val); } } } -namespace kps = paddle::operators::kernel_primitives; - template struct ReduceMaxFunctor { inline Ty initial() { return -std::numeric_limits::infinity(); } @@ -248,10 +250,15 @@ One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle api to compute max (sum) in one warp. */ -template -__global__ void WarpSoftmaxForward(T* softmax, const T* src, - const int batch_size, const int stride, +__global__ void WarpSoftmaxForward(T* softmax, + const T* src, + const int batch_size, + const int stride, const int element_count) { constexpr int kDimCeil = 1 << Log2Elements; constexpr int kWarpSize = (kDimCeil < 32) ? 
kDimCeil : 32; @@ -302,9 +309,13 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, } // compute max - kps::Reduce, - kMode::kLocalMode>(&max[0], &srcdata[0][0][0], - ReduceMaxFunctor(), true); + kps::Reduce, + kMode::kLocalMode>( + &max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum @@ -313,9 +324,13 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, kps::ElementwiseUnary>( &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); } - kps::Reduce, - kMode::kLocalMode>(&sum[0], &srcdata[0][0][0], - kps::AddFunctor(), true); + kps::Reduce, + kMode::kLocalMode>( + &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -340,10 +355,16 @@ One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle api to compute max (sum) in one warp. */ -template -__global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, - int batch_size, int stride, +__global__ void WarpSoftmaxBackward(T* dst, + const T* grad, + const T* src, + int batch_size, + int stride, int element_count) { constexpr int kVSize = sizeof(VecT) / sizeof(T); constexpr int kDimCeil = 1 << Log2Elements; @@ -403,7 +424,11 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, AccT* srcptr = reinterpret_cast(&src_tmp[0][0][0]); kps::ElementwiseBinary>( &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); - kps::Reduce, + kps::Reduce, kps::details::ReduceMode::kLocalMode>( &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); @@ -429,7 +454,10 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, #define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT) \ case Log2Elements: \ - WarpSoftmaxForward<<>>( \ dst, src, batch_size, stride, element_count); \ break; @@ -438,12 +466,16 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, Wrapper of softmax formward with template instantiation on size of input. */ template -void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, - const platform::CUDADeviceContext& dev_ctx, - T* dst, const T* src, const int batch_size, - const int stride, const int element_count, +void SwitchWarpSoftmaxForward(const int blocks, + const dim3 threads, + const GPUContext& dev_ctx, + T* dst, + const T* src, + const int batch_size, + const int stride, + const int element_count, int Log2Elements) { - using AccT = typename details::MPTypeTrait::Type; + using AccT = typename phi::dtype::MPTypeTrait::Type; switch (Log2Elements) { SOFTMAX_WARP_FORWARD_CASE(0, AccT); SOFTMAX_WARP_FORWARD_CASE(1, AccT); @@ -462,7 +494,10 @@ void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, #define SOFTMAX_WARP_BACKWARD_CASE(Log2Elements, AccT) \ case Log2Elements: \ - WarpSoftmaxBackward<<>>( \ dst, grad, src, batch_size, stride, element_count); \ break; @@ -471,12 +506,17 @@ void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, Wrapper of softmax backward with template instantiation on size of input. 
*/ template -void SwitchWarpSoftmaxBackward(const int blocks, const dim3 threads, - const platform::CUDADeviceContext& dev_ctx, - T* dst, const T* grad, const T* src, - const int batch_size, const int stride, - const int element_count, int Log2Elements) { - using AccT = typename details::MPTypeTrait::Type; +void SwitchWarpSoftmaxBackward(const int blocks, + const dim3 threads, + const GPUContext& dev_ctx, + T* dst, + const T* grad, + const T* src, + const int batch_size, + const int stride, + const int element_count, + int Log2Elements) { + using AccT = typename phi::dtype::MPTypeTrait::Type; switch (Log2Elements) { SOFTMAX_WARP_BACKWARD_CASE(0, AccT); SOFTMAX_WARP_BACKWARD_CASE(1, AccT); @@ -501,12 +541,12 @@ void SwitchWarpSoftmaxBackward(const int blocks, const dim3 threads, * Better performence when axis != -1 */ -static void GetGridDim(int high_dim, int mid_dim, int low_dim, - const dim3& block, dim3* grid) { - int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); +static void GetGridDim( + int high_dim, int mid_dim, int low_dim, const dim3& block, dim3* grid) { + int device_id = phi::backends::gpu::GetCurrentDeviceId(); + int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); + phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; int num_threads = block.x * block.y; int max_num_blocks = max_threads / num_threads; @@ -532,16 +572,17 @@ static void GetBlockDim(int mid_dim, int low_dim, dim3* block) { block->x = std::min(block_x, static_cast(max_num_threads / block->y)); } -static void GetLaunchConfig(int high_dim, int mid_dim, int low_dim, dim3* grid, - dim3* block) { +static void GetLaunchConfig( + int high_dim, int mid_dim, int low_dim, dim3* grid, dim3* block) { GetBlockDim(mid_dim, low_dim, block); GetGridDim(high_dim, mid_dim, low_dim, *block, grid); } -template class Functor> -__global__ void NormalSoftmaxForward(T* output, const T* input, int high_dim, - int mid_dim, int low_dim) { +__global__ void NormalSoftmaxForward( + T* output, const T* input, int high_dim, int mid_dim, int low_dim) { using kMode = kps::details::ReduceMode; const int high_stride = mid_dim * low_dim; const int mid_stride = low_dim; @@ -584,11 +625,15 @@ __global__ void NormalSoftmaxForward(T* output, const T* input, int high_dim, } } -template class Functor> -__global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, - const T* output, int high_dim, - int mid_dim, int low_dim) { +__global__ void NormalSoftmaxBackward(T* input_grad, + const T* output_grad, + const T* output, + int high_dim, + int mid_dim, + int low_dim) { using kMode = kps::details::ReduceMode; const int high_stride = mid_dim * low_dim; const int mid_stride = low_dim; @@ -622,58 +667,79 @@ __global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, } template -void LaunchNormalSoftmaxForward(const platform::CUDADeviceContext& dev_ctx, - T* output_data, const T* input_data, - int high_dim, int mid_dim, int low_dim) { - using AccT = typename details::MPTypeTrait::Type; +void LaunchNormalSoftmaxForward(const GPUContext& dev_ctx, + T* output_data, + const T* input_data, + int high_dim, + int mid_dim, + int low_dim) { + using AccT = typename phi::dtype::MPTypeTrait::Type; dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { NormalSoftmaxForward< - T, 
AccT, + T, + AccT, LogSoftmaxForwardFunctor><<>>( output_data, input_data, high_dim, mid_dim, low_dim); } else { NormalSoftmaxForward< - T, AccT, SoftmaxForwardFunctor><<>>( + T, + AccT, + SoftmaxForwardFunctor><<>>( output_data, input_data, high_dim, mid_dim, low_dim); } } template -void LaunchNormalSoftmaxBackward(const platform::CUDADeviceContext& dev_ctx, - T* input_grad_data, const T* output_grad_data, - const T* output_data, int high_dim, - int mid_dim, int low_dim) { - using AccT = typename details::MPTypeTrait::Type; +void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, + T* input_grad_data, + const T* output_grad_data, + const T* output_data, + int high_dim, + int mid_dim, + int low_dim) { + using AccT = typename phi::dtype::MPTypeTrait::Type; dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { NormalSoftmaxBackward< - T, AccT, + T, + AccT, LogSoftmaxBackwardFunctor><<>>( - input_grad_data, output_grad_data, output_data, high_dim, mid_dim, + input_grad_data, + output_grad_data, + output_data, + high_dim, + mid_dim, low_dim); } else { NormalSoftmaxBackward< - T, AccT, SoftmaxBackwardFunctor><<>>( - input_grad_data, output_grad_data, output_data, high_dim, mid_dim, + T, + AccT, + SoftmaxBackwardFunctor><<>>( + input_grad_data, + output_grad_data, + output_data, + high_dim, + mid_dim, low_dim); } } template -void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const Tensor& x, const int input_axis, - Tensor* out) { +void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, + const DenseTensor& x, + const int input_axis, + DenseTensor* out) { auto* out_data = out->data(); auto dims = x.dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(input_axis, rank); + const int axis = phi::funcs::CanonicalAxis(input_axis, rank); const int dim = dims[axis]; - const int N = SizeToAxis(axis, dims); - const int D = SizeOutAxis(axis, dims); + const int N = phi::funcs::SizeToAxis(axis, dims); + const int D = phi::funcs::SizeOutAxis(axis, dims); constexpr int max_dim = 512; constexpr int warps_per_block = 4; @@ -697,25 +763,43 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, using T2 = typename VecT2::Type; if (dim % 4 == 0) { - SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, - out_data, x.data(), N, dim, - dim, kDimLog2); + SwitchWarpSoftmaxForward(blocks, + threads, + dev_ctx, + out_data, + x.data(), + N, + dim, + dim, + kDimLog2); } else if (dim % 2 == 0) { - SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, - out_data, x.data(), N, dim, - dim, kDimLog2); + SwitchWarpSoftmaxForward(blocks, + threads, + dev_ctx, + out_data, + x.data(), + N, + dim, + dim, + kDimLog2); } else { - SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, - out_data, x.data(), N, dim, - dim, kDimLog2); + SwitchWarpSoftmaxForward(blocks, + threads, + dev_ctx, + out_data, + x.data(), + N, + dim, + dim, + kDimLog2); } } else if (D > 1) { - LaunchNormalSoftmaxForward(dev_ctx, out_data, x.data(), N, - dim, D); + LaunchNormalSoftmaxForward( + dev_ctx, out_data, x.data(), N, dim, D); } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; + GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #else @@ -728,46 +812,74 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), desc_, x.data(), - platform::CudnnDataType::kZero(), desc_, out_data, - MIOPEN_SOFTMAX_LOG, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxForward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + out_data, + MIOPEN_SOFTMAX_LOG, + mode)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), desc_, x.data(), - platform::CudnnDataType::kZero(), desc_, out_data, - MIOPEN_SOFTMAX_ACCURATE, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxForward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + out_data, + MIOPEN_SOFTMAX_ACCURATE, + mode)); } #else auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - desc_, x.data(), platform::CudnnDataType::kZero(), desc_, + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward( + handle, + CUDNN_SOFTMAX_LOG, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, out_data)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_ACCURATE, mode, - platform::CudnnDataType::kOne(), desc_, x.data(), - platform::CudnnDataType::kZero(), desc_, out_data)); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward( + handle, + CUDNN_SOFTMAX_ACCURATE, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + out_data)); } #endif } } template -void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const Tensor& out, const Tensor& dout, - const int input_axis, Tensor* dx) { +void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + const int input_axis, + DenseTensor* dx) { auto* dx_data = dx->data(); auto dims = out.dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(input_axis, rank); + const int axis = phi::funcs::CanonicalAxis(input_axis, rank); const int dim = dims[axis]; - const int N = SizeToAxis(axis, dims); - const int D = SizeOutAxis(axis, dims); + const int N = phi::funcs::SizeToAxis(axis, dims); + const int D = phi::funcs::SizeOutAxis(axis, dims); constexpr int max_dim = 512; constexpr int warps_per_block = 4; @@ -788,25 +900,46 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, using T4 = typename VecT4::Type; using T2 = typename VecT2::Type; if (dim % 4 == 0) { - SwitchWarpSoftmaxBackward( - blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, - dim, dim, kDimLog2); + SwitchWarpSoftmaxBackward(blocks, + threads, + dev_ctx, + dx_data, + dout.data(), + out.data(), + N, + dim, + dim, + kDimLog2); } else if (dim % 2 == 0) { - SwitchWarpSoftmaxBackward( - blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, - dim, dim, kDimLog2); + SwitchWarpSoftmaxBackward(blocks, + threads, + dev_ctx, + dx_data, + dout.data(), + 
out.data(), + N, + dim, + dim, + kDimLog2); } else { - SwitchWarpSoftmaxBackward( - blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, - dim, dim, kDimLog2); + SwitchWarpSoftmaxBackward(blocks, + threads, + dev_ctx, + dx_data, + dout.data(), + out.data(), + N, + dim, + dim, + kDimLog2); } } else if (D > 1) { - LaunchNormalSoftmaxBackward(dev_ctx, dx_data, dout.data(), - out.data(), N, dim, D); + LaunchNormalSoftmaxBackward( + dev_ctx, dx_data, dout.data(), out.data(), N, dim, D); } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; + GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #else @@ -819,33 +952,68 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( - handle, platform::CudnnDataType::kOne(), desc_, out.data(), - desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, - dx_data, MIOPEN_SOFTMAX_LOG, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxBackward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data, + MIOPEN_SOFTMAX_LOG, + mode)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( - handle, platform::CudnnDataType::kOne(), desc_, out.data(), - desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, - dx_data, MIOPEN_SOFTMAX_ACCURATE, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxBackward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data, + MIOPEN_SOFTMAX_ACCURATE, + mode)); } #else auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - desc_, out.data(), desc_, dout.data(), - platform::CudnnDataType::kZero(), desc_, dx_data)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSoftmaxBackward( + handle, + CUDNN_SOFTMAX_LOG, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( - handle, CUDNN_SOFTMAX_ACCURATE, mode, - platform::CudnnDataType::kOne(), desc_, out.data(), desc_, - dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSoftmaxBackward( + handle, + CUDNN_SOFTMAX_ACCURATE, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data)); } #endif } } -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..56e5fef6e37e41dd6405af25c214013211670246 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void SoftmaxGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + SoftmaxBackwardCUDAKernelDriver(dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..427d1729a13a8ea8e0caf4aa534b012af76e79f2 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gumbel_softmax_grad_kernel.h b/paddle/phi/kernels/gumbel_softmax_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e3f02d90fcb6ad347468fc4943d6ab445cb1c5f0 --- /dev/null +++ b/paddle/phi/kernels/gumbel_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GumbelSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/gumbel_softmax_kernel.h b/paddle/phi/kernels/gumbel_softmax_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..46edb9750dd34832b1c908822f6e322e548db951 --- /dev/null +++ b/paddle/phi/kernels/gumbel_softmax_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
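The gumbel_softmax headers added here only declare the kernel interfaces (the forward kernel takes a temperature, a hard flag, and an axis); the implementations live elsewhere. For orientation, the standard Gumbel-Softmax recipe such a kernel is expected to follow is: perturb each logit with Gumbel noise g = -log(-log(u)), u ~ U(0, 1), then take a temperature-scaled softmax. A minimal single-row CPU sketch under that assumption — gumbel_softmax_row is an illustrative name, not the actual Paddle implementation:

#include <algorithm>
#include <cmath>
#include <limits>
#include <random>
#include <vector>

// Gumbel-Softmax over one row of logits:
//   y_i = exp((x_i + g_i) / tau) / sum_j exp((x_j + g_j) / tau),  g_i = -log(-log(u_i))
std::vector<float> gumbel_softmax_row(const std::vector<float>& logits,
                                      float temperature,
                                      std::mt19937& rng) {
  std::uniform_real_distribution<float> uniform(1e-10f, 1.0f);  // avoid log(0)
  std::vector<float> y(logits.size());
  float max_val = -std::numeric_limits<float>::infinity();
  for (size_t i = 0; i < logits.size(); ++i) {
    const float g = -std::log(-std::log(uniform(rng)));  // Gumbel(0, 1) noise
    y[i] = (logits[i] + g) / temperature;
    max_val = std::max(max_val, y[i]);
  }
  float sum = 0.0f;
  for (float& v : y) {  // stable softmax over the perturbed, scaled logits
    v = std::exp(v - max_val);
    sum += v;
  }
  for (float& v : y) {
    v /= sum;
  }
  return y;
}

When hard is true, implementations typically replace y with a one-hot vector at argmax(y) in the forward pass while keeping the soft values for the backward pass (straight-through estimator); that step is omitted from this sketch.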
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GumbelSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 4b31393a71f3623bff168dfc17612ceda250c506..78c25200bbd284489ee431cdb78a81748565050b 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/abs_grad_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -53,7 +53,7 @@ void AbsGradKernel(const Context& ctx, ctx.template Alloc(dx, static_cast(numel * sizeof(T))); auto* dx_data = dx->data(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } @@ -70,7 +70,7 @@ void AbsDoubleGradKernel(const Context& ctx, ctx.template Alloc(ddout, static_cast(numel * sizeof(T))); auto* ddout_data = ddout->data(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsGradGradFunctor functor( ddx_data, x_data, ddout_data, numel); for_range(functor); diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d5efd22a31daa0def31102f46afce6a857ec1849 --- /dev/null +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/addmm_grad_kernel.h" + +#include +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; + +template +void AddmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto in_dims = input.dims(); + int total_elems = 0; + + VLOG(3) << "alpha: " << alpha << " beta: " << beta; + + if (input_grad != nullptr) { + input_grad->set_lod(out_grad.lod()); + } + if (x_grad != nullptr) { + x_grad->set_lod(x.lod()); + } + if (y_grad != nullptr) { + y_grad->set_lod(y.lod()); + } + + auto blas = funcs::GetBlas(dev_ctx); + if (input_grad) { + dev_ctx.template Alloc(input_grad); + total_elems = in_dims[0] * in_dims[1]; + auto& place = *dev_ctx.eigen_device(); + auto eigen_dout = PhiEigenTensor::From(out_grad); + auto eigen_dinput = PhiEigenTensor::From(*input_grad); + + bool row_compress = in_dims[0] != out_grad.dims()[0]; + bool col_compress = in_dims[1] != out_grad.dims()[1]; + auto eigen_dinput_shape = + Array2(input_grad->dims()[0], input_grad->dims()[1]); + + if (row_compress && col_compress) { + eigen_dinput.device(place) = + eigen_dout.sum().eval().reshape(eigen_dinput_shape); + } else if (row_compress) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + } else if (col_compress) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + } else { + blas.VCOPY(total_elems, out_grad.data(), input_grad->data()); + } + + blas.SCAL(total_elems, beta, input_grad->data()); + } + if (x_grad) { + dev_ctx.template Alloc(x_grad); + total_elems = x.dims()[0] * x.dims()[1]; + // x_grad = out_grad * y'. x_grad: M x K, out_grad : M x N, y : K x N + blas.MatMul(out_grad, false, y, true, x_grad); + blas.SCAL(total_elems, alpha, x_grad->data()); + } + if (y_grad) { + dev_ctx.template Alloc(y_grad); + total_elems = x.dims()[1] * y.dims()[1]; + // y_grad = x' * out_grad. y_grad K x N, out_grad : M x N, x : M x K + blas.MatMul(x, true, out_grad, false, y_grad); + blas.SCAL(total_elems, alpha, y_grad->data()); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f7afdfd622e63e88a99891d0e8cf5942f9454858 --- /dev/null +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; + +template +void AddmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float alpha, + float beta, + DenseTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + // broadcast mode check + if (x_dims[0] != input_dims[0]) { + PADDLE_ENFORCE_EQ(input_dims[0], + 1, + errors::InvalidArgument( + "When x_dims[0] is not equal with input_dims[0], " + "input_dims[0] must be 1 but got %s", + input_dims[0])); + PADDLE_ENFORCE_EQ(y_dims[1] == input_dims[1] || input_dims[1] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + // broadcast mode check + if (y_dims[1] != input_dims[1]) { + PADDLE_ENFORCE_EQ(input_dims[1], + 1, + errors::InvalidArgument( + "When y_dims[1] is not equal with input_dims[0], " + "input_dims[0] must be 1 but got %s", + input_dims[1])); + PADDLE_ENFORCE_EQ(x_dims[0] == input_dims[0] || input_dims[0] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + // broadcast mode check + PADDLE_ENFORCE_EQ( + x_dims[1], + y_dims[0], + errors::InvalidArgument( + "The input tensor X's width must be equal with matrix Y' height. " + "But received X's shape = [%s], Y's shape = [%s].", + x_dims[1], + y_dims[0])); + + dev_ctx.template Alloc(out); + auto blas = funcs::GetBlas(dev_ctx); + + // calc broadcast dim + Array2 bcast_dims; + bcast_dims[0] = x_dims[0] / input_dims[0]; + bcast_dims[1] = y_dims[1] / input_dims[1]; + VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; + // broadcast using eigen + auto eigen_input = PhiEigenTensor::From(input); + auto eigen_out = PhiEigenTensor::From(*out); + auto& place = *dev_ctx.eigen_device(); + funcs::EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); + + blas.GEMM(false, + false, + x_dims[0], + y_dims[1], + x_dims[1], + alpha, + x.data(), + x_dims[1], + y.data(), + y_dims[1], + beta, + out->data(), + y_dims[1]); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h index 5f75a95f4a7b18f0ccf450e003860eeeef3c649d..d0dd18298518ab351918aa2492eb48d11d3cf1d7 100644 --- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -85,7 +85,7 @@ void Atan2GradKernel(const Context& ctx, auto* y_grad_data = ctx.template Alloc(y_grad, size_t(y.numel() * sizeof(T))); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::Atan2GradFunctor functor( x_data, y_data, out_grad_data, x_grad_data, y_grad_data, numel); for_range(functor); diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h 
b/paddle/phi/kernels/impl/atan2_kernel_impl.h index c29449a27e0b5603c4e6f50c8ed676677c29796a..2cae914e2f61555377f7a41b3d89cdbb2b589247 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { template @@ -80,7 +80,7 @@ void Atan2Kernel(const Context& ctx, auto* out_data = ctx.template Alloc::type>( out, size_t(x.numel() * sizeof(typename Atan2Out::type))); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::Atan2Functor functor(x_data, y_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..c199833b42a99558966e785e74e172c3a0c1c14f --- /dev/null +++ b/paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void BilinearTensorProductGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* dweight, + DenseTensor* dbias) { + auto batch_size = x.dims()[0]; + auto weight_dims = weight.dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + + auto x_mat = EigenMatrix::From(x); + auto y_mat = EigenMatrix::From(y); + auto dout_mat = EigenMatrix::From(dout); + auto& place = *ctx.eigen_device(); + // Create the intermediate variable to calculate the Output(Y@Grad). + DenseTensor x_scale; + x_scale.Resize(make_ddim({batch_size, x_dim})); + ctx.template Alloc(&x_scale); + auto x_scale_mat = EigenMatrix::From(x_scale); + + // Create the intermediate variable to calculate the Output(X@Grad). + DenseTensor y_scale; + y_scale.Resize(make_ddim({batch_size, y_dim})); + ctx.template Alloc(&y_scale); + auto y_scale_mat = EigenMatrix::From(y_scale); + + funcs::SetConstant set_zero; + + if (dx) { + ctx.template Alloc(dx); + set_zero(ctx, dx, static_cast(0)); + } + + if (dy) { + ctx.template Alloc(dy); + set_zero(ctx, dy, static_cast(0)); + } + + if (dweight) { + ctx.template Alloc(dweight); + } + + auto blas = funcs::GetBlas(ctx); + + // Caculate the Output(X@Grad) and Output(Y@Grad). 
+ if (dx || dy || dweight) { + Eigen::DSizes bcast_for_x(1, y_dim); + Eigen::DSizes bcast_for_y(1, x_dim); + Eigen::DSizes bcast_for_weight(1, x_dim); + + for (int i = 0; i < out_dim; ++i) { + DenseTensor weight_i = + weight.Slice(i, i + 1).Resize(make_ddim({x_dim, y_dim})); + auto output_vec = dout_mat.chip(i, 1); + + if (dx) { + y_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_x) * + y_mat; + blas.GEMM(CblasNoTrans, + CblasTrans, + batch_size, + x_dim, + y_dim, + 1, + y_scale.data(), + weight_i.data(), + 1, + dx->data()); + } + + if (dy || dweight) { + auto output_vec_y = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_y); + x_scale_mat.device(place) = output_vec_y * x_mat; + if (dy) { + blas.GEMM(CblasNoTrans, + CblasNoTrans, + batch_size, + y_dim, + x_dim, + 1, + x_scale.data(), + weight_i.data(), + 1, + dy->data()); + } + if (dweight) { + DenseTensor dweight_i = + dweight->Slice(i, i + 1).Resize(make_ddim({x_dim, y_dim})); + blas.GEMM(CblasTrans, + CblasNoTrans, + x_dim, + y_dim, + batch_size, + 1, + x_scale.data(), + y.data(), + 0, + dweight_i.data()); + } + } + } + } + + // calculate the gradient of Input(Bias). + if (dbias) { + ctx.template Alloc(dbias); + auto dbias_mat = EigenVector::Flatten(*dbias); + dbias_mat.device(place) = dout_mat.sum(Eigen::DSizes(0)); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..3f30a4b958ebe05182bb3061e44312880e4cd8e3 --- /dev/null +++ b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void BilinearTensorProductKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + paddle::optional bias, + DenseTensor* out) { + ctx.template Alloc(out); + + auto y_mat = EigenMatrix::From(y); + auto output_mat = EigenMatrix::From(*out); + + auto batch_size = x.dims()[0]; + auto weight_dims = weight.dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + auto& place = *ctx.eigen_device(); + + // Create the intermediate variable to calculate the result of + // Input(X) multiplied by Input(Weight_i), the formula is: + // left_mul = X Weight_i. 
+ DenseTensor left_mul; + left_mul.Resize(phi::make_ddim({batch_size, y_dim})); + ctx.template Alloc(&left_mul); + auto left_mul_mat = EigenMatrix::From(left_mul); + + for (int i = 0; i < out_dim; ++i) { + auto output_col_vec = output_mat.chip(i, 1); + DenseTensor weight_mat = + weight.Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); + phi::funcs::GetBlas(ctx).GEMM(CblasNoTrans, + CblasNoTrans, + batch_size, + y_dim, + x_dim, + 1, + x.data(), + weight_mat.data(), + 0, + left_mul.data()); + output_col_vec.device(place) = + (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); + } + if (bias.get_ptr()) { + auto bias_vec = EigenMatrix::From(*(bias.get_ptr())); + Eigen::DSizes bcast(batch_size, 1); + output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..b8df86cc693445306885d765160e19b262b96cb3 --- /dev/null +++ b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h @@ -0,0 +1,336 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/cholesky_grad_kernel.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +inline void TransCompute(const int dim, + const Context& dev_ctx, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + funcs::Transpose trans1; + trans1(dev_ctx, in, out, axis); + break; + case 2: + funcs::Transpose trans2; + trans2(dev_ctx, in, out, axis); + break; + case 3: + funcs::Transpose trans3; + trans3(dev_ctx, in, out, axis); + break; + case 4: + funcs::Transpose trans4; + trans4(dev_ctx, in, out, axis); + break; + case 5: + funcs::Transpose trans5; + trans5(dev_ctx, in, out, axis); + break; + case 6: + funcs::Transpose trans6; + trans6(dev_ctx, in, out, axis); + break; + default: + // for dim >= 7 situation + funcs::TransposeNormal trans_normal; + trans_normal(dev_ctx, in, out, axis); + } +} + +/*! Use these functors to implement tril, triu, diagonal and other operators */ +template +struct EyeFunctor { + EyeFunctor(const int m, const int n, T* output) + : m_(m), n_(n), output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int global_row = index / n_; + const int col = index - global_row * n_; + const int batch = global_row / m_; + const int row = global_row - batch * m_; + output_[index] = col == row ? static_cast(1) : static_cast(0); + } + + const int m_, n_; + T* output_; +}; + +template +struct MatrixSetDiagFunctor { + /*! Overwrite specified diagonals of output by the values in diagonal. + * diagonals can be a central band specified by num_diags and + * upper_diag_index, where upper_diag_index=0 refers to the main diagonal, + * positive value means superdiagonal and negative value means subdiagonal. 
+ * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len] + * and the num_diags diagonals has a up to down layout. Otherwise it has a + * shape [i, j, ..., max_diag_len]. + */ + MatrixSetDiagFunctor(const int m, + const int n, + const int num_diags, + const int max_diag_len, + const int upper_diag_index, + const T* diag, + T* output) + : m_(m), + n_(n), + num_diags_(num_diags), + max_diag_len_(max_diag_len), + upper_diag_index_(upper_diag_index), + diag_(diag), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int batch_and_diag_index = index / max_diag_len_; + const int index_in_the_diagonal = + index - batch_and_diag_index * max_diag_len_; + const int batch = batch_and_diag_index / num_diags_; + const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; + // diag_index=0 refers to the main diagonal + const int diag_index = upper_diag_index_ - diag_index_in_input; + // shift down for subdiagonal if diag_index < 0 + const int y_index = + index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); + // shift right for superdiagonal if diag_index > 0 + const int x_index = + index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); + + // Upper-bound checks for diagonals shorter than max_diag_len. + // y_index and x_index are nonnegative by construction. + if (y_index < m_ && x_index < n_) { + const int out_index = batch * m_ * n_ + y_index * n_ + x_index; + output_[out_index] = diag_[index]; + } + } + + const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; + const T* diag_; + T* output_; +}; + +template +struct MatrixDiagPartFunctor { + /*! Similar to MatrixSetDiagFunctor but return the diagonals. diag_index=0 + * refers to the main diagonal, positive value means superdiagonal and + * negative value means subdiagonal */ + MatrixDiagPartFunctor(const int m, + const int n, + const int num_diags, + const int max_diag_len, + const int upper_diag_index, + const T padding, + const T* input, + T* output) + : m_(m), + n_(n), + num_diags_(num_diags), + max_diag_len_(max_diag_len), + upper_diag_index_(upper_diag_index), + input_(input), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int batch_and_mapped_diag_index = index / max_diag_len_; + const int index_in_the_diagonal = + index - batch_and_mapped_diag_index * max_diag_len_; + const int batch = batch_and_mapped_diag_index / num_diags_; + const int mapped_diag_index = + batch_and_mapped_diag_index - batch * num_diags_; + // diag_index=0 refers to the main diagonal + const int diag_index = upper_diag_index_ - mapped_diag_index; + // shift down for subdiagonal if diag_index < 0 + const int y_index = + index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); + // shift right for superdiagonal if diag_index > 0 + const int x_index = + index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); + if (y_index < m_ && x_index < n_) { + output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index]; + } else { + output_[index] = padding_; + } + } + + const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; + const T padding_; + const T* input_; + T* output_; +}; + +template +struct MatrixBandPartScaleEndFunctor { + /*! Compared with MatrixBandPartFunctor, it scale up values at the end of + * band. It can be used to fuse the following operations, which actually + * output triangular with diagonal scaled up: + * 1. dig = matrix_diag_part(middle) + * 2. middle = matrix_set_diag(middle, diag * scalar) + * 3. 
middle = matrix_band_part(middle, -1, 0) + */ + MatrixBandPartScaleEndFunctor(const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const T scale, + const T* input, + T* output) + : m_(m), + n_(n), + num_lower_diags_(num_lower_diags), + num_upper_diags_(num_upper_diags), + scale_(scale), + input_(input), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int col = index % n_; + const int row = (index / n_) % m_; + const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); + const int band_end = + (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); + if (col < band_start || col >= band_end) { + output_[index] = 0; + } else if (col == band_end - 1) { + output_[index] = scale_ * input_[index]; + } else { + output_[index] = input_[index]; + } + } + + const int m_, n_, num_lower_diags_, num_upper_diags_; + const T scale_; + const T* input_; + T* output_; +}; + +template +struct AddtoScaleFunctor { + AddtoScaleFunctor(const T scale, const T* input, T* output) + : scale_(scale), input_(input), output_(output) {} + HOSTDEVICE void operator()(size_t index) const { + output_[index] += input_[index]; + output_[index] *= scale_; + } + const T scale_; + const T* input_; + T* output_; +}; + +template +void CholeskyGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + bool upper, + DenseTensor* x_grad) { + auto* x_grad_data = dev_ctx.template Alloc(x_grad); + + auto& dims = out.dims(); + int batch_count = 1; + for (int i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + auto m = dims[dims.size() - 1]; + int tensor_size = batch_count * m * m; + + std::vector axis(dims.size() - 2); + std::iota(axis.begin(), axis.end(), 0); + axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2}); + DenseTensor l, l_grad; + if (upper) { + l.Resize(dims); + dev_ctx.template Alloc(&l); + l_grad.Resize(dims); + dev_ctx.template Alloc(&l_grad); + TransCompute(dims.size(), dev_ctx, out, &l, axis); + TransCompute(dims.size(), dev_ctx, out_grad, &l_grad, axis); + } else { + l = out; + l_grad = out_grad; + } + auto* l_data = l.data(); + + /*! refer to Iain Murray (2016); arXiv 1602.07527 */ + /*! phi = matmul(L.transpose(-1, -2), grad) */ + DenseTensor middle; + middle.Resize(dims); + auto* middle_data = dev_ctx.template Alloc(&middle); + auto trans_desc = funcs::CreateMatrixDescriptor(dims, 0, true); + auto no_trans_desc = funcs::CreateMatrixDescriptor(dims, 0, false); + auto blas = funcs::GetBlas(dev_ctx); + blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0)); + + /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */ + paddle::platform::ForRange for_range(dev_ctx, tensor_size); + MatrixBandPartScaleEndFunctor matrix_band_part_scale_end_functor( + m, + m, + /* num_lower_diags */ m, + /* num_upper_diags */ 0, + /* scale */ 0.5, + middle_data, + middle_data); + for_range(matrix_band_part_scale_end_functor); + + // Compute inverse by solving the triangular linear system AX = B, where B + // is the identity matrix. 
The matrix X would be overwritten on B + DenseTensor identity; + identity.Resize(dims); + auto* identity_data = dev_ctx.template Alloc(&identity); + EyeFunctor eye_functor(m, m, identity_data); + for_range(eye_functor); + // TODO(guosheng): use trsmBatched for GPU + for (int i = 0; i < batch_count; i++) { + blas.TRSM(/*side*/ CblasLeft, + /*uplo*/ CblasLower, + /*trans*/ CblasNoTrans, + /*diag*/ CblasNonUnit, + /*m*/ m, + /*n*/ m, + /*alpha*/ T(1), + l_data + i * m * m, + /*lda*/ m, + identity_data + i * m * m, + /*ldb*/ m); + } + DenseTensor& l_inverse = identity; + + /*! x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */ + DenseTensor middle1; + middle1.Resize(dims); + dev_ctx.template Alloc(&middle1); + blas.MatMul( + l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, T(0)); + blas.MatMul( + middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, T(0)); + + /*! x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */ + DenseTensor x_grad_trans; + x_grad_trans.Resize(dims); + auto* x_grad_trans_data = dev_ctx.template Alloc(&x_grad_trans); + TransCompute(dims.size(), dev_ctx, *x_grad, &x_grad_trans, axis); + AddtoScaleFunctor addto_scale_functor(0.5, x_grad_trans_data, x_grad_data); + for_range(addto_scale_functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index febc464e6a1f5780ac6a25f0baa55449014a4f66..a10481284b17fbc21865ab8aa3b5ebad4e0a7d95 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -28,7 +28,7 @@ void RealGradKernel(const Context& dev_ctx, auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } @@ -42,7 +42,7 @@ void ImagGradKernel(const Context& dev_ctx, auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index 2f9b1ad04665378307b099f0fc3a0c75f487e41a..ff5cf86ed2ea240747f70f4410b339a135a49d3a 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -28,7 +28,7 @@ void ConjKernel(const Context& dev_ctx, auto* x_data = x.data(); auto* out_data = dev_ctx.template Alloc(out); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } @@ -42,7 +42,7 @@ void RealKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc>( out, static_cast(numel * sizeof(phi::funcs::Real))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); } @@ -56,7 +56,7 @@ void ImagKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc>( out, static_cast(numel * sizeof(phi::funcs::Real))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h index f94fe7168b2a5cb338f5fdc741d9be56b810f7c6..74ded1569eb5804950898bc1b824367b56480cda 100644 --- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -47,7 +47,7 @@ void DigammaGradKernel(const Context& ctx, auto* x_data = x.data(); auto* dx_data = x_grad->data(); auto numel = out_grad.numel(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); DigammaGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/digamma_kernel_impl.h b/paddle/phi/kernels/impl/digamma_kernel_impl.h index 5a924a322d6e9941475854dbc01bc4b1d0084bb5..8994979e64d70753ba7b0a6a4debc5e48a95f243 100644 --- a/paddle/phi/kernels/impl/digamma_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -41,7 +41,7 @@ void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto* x_data = x.data(); auto* out_data = out->data(); auto numel = x.numel(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); DigammaFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index cafcb302d65b9add338851d6ffd6df56158230e0..460e74b58166a5132bdbd62703f4dc3d5ef34a91 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -36,12 +36,12 @@ void AddGradImpl(const Context& dev_ctx, x_grad->dims() == out_grad.dims()) { VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't " "reduce"; - phi::Copy(dev_ctx, out_grad, false, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); } else if (x_grad == nullptr && y_grad != nullptr && 
y_grad->dims() == out_grad.dims()) { VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't " "reduce"; - phi::Copy(dev_ctx, out_grad, false, y_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, y_grad); } else { grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis); } diff --git a/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ae76574e04e71ae722a4e306456a55c56e4464c1 --- /dev/null +++ b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ErfinvGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + auto eigen_out = EigenVector::Flatten(out); + auto eigen_dout = EigenVector::Flatten(out_grad); + auto eigen_dx = EigenVector::Flatten(*x_grad); + auto& place = *ctx.eigen_device(); + constexpr T half_sqrt_pi = static_cast(1 / M_2_SQRTPI); + eigen_dx.device(place) = half_sqrt_pi * eigen_dout * eigen_out.square().exp(); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/erfinv_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..c0fb8a01b997186026b0b3c59019e4a32b2bcafe --- /dev/null +++ b/paddle/phi/kernels/impl/erfinv_kernel_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
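+
+// Note on the formula used below: Eigen's ndtri() is the inverse of the
+// standard normal CDF, and erf(y) = 2 * Phi(y * sqrt(2)) - 1, so
+//   erfinv(x) = ndtri((x + 1) / 2) / sqrt(2),
+// which is what ErfinvKernel evaluates (M_SQRT1_2 = 1 / sqrt(2)).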
+ +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc(out); + auto eigen_in = EigenVector::Flatten(x); + auto eigen_out = EigenVector::Flatten(*out); + auto& place = *ctx.eigen_device(); + constexpr T half = static_cast(0.5); + constexpr T half_sqrt = static_cast(M_SQRT1_2); + eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h index 889b560dd7398a2bc07f95b8ce607efc0e2372bd..766f91cd22e1f4584708c506b0ef5f742fdc366e 100644 --- a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h @@ -88,7 +88,7 @@ void ExpandGradKernel(const Context& ctx, } // no need reduce, just copy if (just_copy) { - phi::Copy(ctx, out_grad, false, in_grad); + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad); } else { PADDLE_ENFORCE_GE(dims, 1, diff --git a/paddle/phi/kernels/impl/eye_kernel_impl.h b/paddle/phi/kernels/impl/eye_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..453652273a25b2140376712a673713b2f9fbe12b --- /dev/null +++ b/paddle/phi/kernels/impl/eye_kernel_impl.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct EyeFunctor { + EyeFunctor(int64_t num_columns, T* output) + : num_columns_(num_columns), output_(output) {} + + HOSTDEVICE void operator()(size_t idx) const { + output_[idx * num_columns_ + idx] = static_cast(1); + } + + int64_t num_columns_; + T* output_; +}; + +template +void EyeKernel(const Context& ctx, + int64_t num_rows, + int64_t num_columns, + int dtype, + DenseTensor* out) { + auto num = num_columns; + if (num == -1) { + num = num_rows; + } + T* out_data = ctx.template Alloc(out); + phi::funcs::SetConstant set_zero; + set_zero(ctx, out, static_cast(0)); + int64_t num_eyes = (std::min)(num_rows, num); + paddle::platform::ForRange for_range(ctx, num_eyes); + EyeFunctor functor(num, out_data); + for_range(functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/full_kernel_impl.h b/paddle/phi/kernels/impl/full_kernel_impl.h deleted file mode 100644 index 8cced49906eccdc41ccfb02518dcd06d771d23c9..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/impl/full_kernel_impl.h +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace phi { - -template -void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { - dev_ctx.template Alloc(tensor); - auto t = phi::EigenVector::Flatten(*tensor); - t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(val)); -} - -template -void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); - FullValue(dev_ctx, out, val.to()); -} - -template -void FullLikeKernel(const Context& dev_ctx, - const Scalar& val, - DenseTensor* out) { - auto value = val.to(); - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - phi::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - static_cast(value))); - FullValue(dev_ctx, out, value); -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h b/paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..3d57dd1002ac853093f089f8eaa7f78ac96de078 --- /dev/null +++ b/paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void GumbelSoftmaxGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx) { + const int rank = dx->dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + int axis_dim = dx->dims()[axis]; + // allocate memory on device. 
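+  // The backward pass reuses the plain softmax gradient: dx, out and dout are
+  // viewed as [size_to_axis, size_from_axis] matrices and handed to
+  // SoftmaxGradFunctor, with axis_dim giving the softmax length inside the
+  // flattened trailing dimension.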
+ + ctx.template Alloc(dx); + if (dx->numel() == 0) { + return; + } + + const int size_to_axis = funcs::SizeToAxis(axis, dx->dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, dx->dims()); + DenseTensor dx_2d(*dx), out_2d(out), dout_2d(dout); + dx_2d.Resize({size_to_axis, size_from_axis}); + out_2d.Resize({size_to_axis, size_from_axis}); + dout_2d.Resize({size_to_axis, size_from_axis}); + paddle::operators::math::SoftmaxGradFunctor()( + ctx, axis_dim, &out_2d, &dout_2d, &dx_2d); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..2517d84898727bf07e64edb5960168e9d55e5d70 --- /dev/null +++ b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct ArgMaxFunctor { + void operator()(const Context& ctx, + const DenseTensor& in, + DenseTensor* index_tensor, + const int64_t& axis) { + auto in_eigen = EigenTensor::From(in, in.dims()); + auto index_eigen = EigenTensor::From(*index_tensor); + index_eigen = in_eigen.argmax(axis).template cast(); + } +}; + +template +struct GumbleNoiseGenerator; + +template +struct OneHotGenerator; + +template +void GumbelSoftmaxKernel(const Context& ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + int axis_dim = x.dims()[axis]; + + PADDLE_ENFORCE_GT(temperature, + 0, + phi::errors::InvalidArgument( + "The temperature must be greater than 0. But " + "received temperature = %f", + temperature)); + + // allocate memory on device. 
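+  // Rough flow of the kernel below:
+  //   1. GumbleNoiseGenerator::Transform fills x_noise_2d with the Gumbel-
+  //      perturbed, temperature-scaled logits (device-specific specializations
+  //      are defined elsewhere),
+  //   2. a row-wise softmax over the [size_to_axis, size_from_axis] view
+  //      produces the soft samples in `out`,
+  //   3. when `hard` is true, OneHotGenerator::Transform turns each sample
+  //      into a one-hot vector along `axis`.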
+ ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + + const int size_to_axis = funcs::SizeToAxis(axis, x.dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, x.dims()); + DenseTensor x_noise_2d, out_2d(*out); + x_noise_2d.Resize({size_to_axis, size_from_axis}); + out_2d.Resize({size_to_axis, size_from_axis}); + + // generate gumbel noise and add it to X + auto* x_noise_data = ctx.template Alloc(&x_noise_2d); + GumbleNoiseGenerator::Transform(ctx, + x.data(), + x_noise_data, + size_to_axis, + size_from_axis, + temperature); + +#ifdef PADDLE_ON_INFERENCE + paddle::operators::math::SoftmaxFunctor()( + ctx, axis_dim, &x_noise_2d, &out_2d); +#else + paddle::operators::math::SoftmaxFunctor()( + ctx, axis_dim, &x_noise_2d, &out_2d); +#endif + + if (hard) { + OneHotGenerator::Transform(ctx, x, out, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/increment_kernel_impl.h b/paddle/phi/kernels/impl/increment_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..0756807a87532812fdd7ff2ad6fc1bd8a125aa26 --- /dev/null +++ b/paddle/phi/kernels/impl/increment_kernel_impl.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/increment_kernel.h" + +namespace phi { + +template +void IncrementKernel(const Context& dev_ctx, + const DenseTensor& x, + float value, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto& dev = *dev_ctx.eigen_device(); + funcs::EigenAdd, T>::Eval( + dev, + EigenScalar::From(*out), + EigenScalar::From(x), + static_cast(value)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 8b94fa1d22eb5254dcb1e92eb7bd98bfe368b4ae..f2549c171dda00ecab0baf8b6a7cdfb26ddea4d0 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -596,7 +596,6 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, ddout_flag = true; } } - if (ddy) { auto ddy_mat = ddy.get(); if (ddy_mat.dims() != y_help.dims()) { diff --git a/paddle/phi/kernels/impl/mv_kernel_impl.h b/paddle/phi/kernels/impl/mv_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1754ea323ceb959c721f5e4c27058a652e1575c1 --- /dev/null +++ b/paddle/phi/kernels/impl/mv_kernel_impl.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +void MvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& vec, + DenseTensor* out) { + auto dim_x = x.dims(); + + // get data ptr + const T* x_data = x.data(); + const T* vec_data = vec.data(); + T* out_data = dev_ctx.template Alloc(out); + + auto blas = phi::funcs::GetBlas(dev_ctx); + + blas.GEMV(false, + dim_x[0], + dim_x[1], + static_cast(1), + x_data, + vec_data, + static_cast(0), + out_data); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..db19a04337932a88388da9cdeb32abc0c4fc0466 --- /dev/null +++ b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PixelShuffleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int upscale_factor, + const std::string& data_format, + DenseTensor* x_grad) { + auto* dout = &out_grad; + auto* dx = x_grad; + ctx.template Alloc(dx); + int factor = upscale_factor; + bool channel_last = (data_format == "NHWC"); + auto do_dims = dout->dims(); + auto dx_dims = dx->dims(); + + DenseTensor t(*dout); + if (!channel_last) { + t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); + } else { + t.Resize({do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]}); + } + std::vector axis = {0, 1, 3, 5, 2, 4}; + + DenseTensor o(*dx); + if (!channel_last) { + o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); + } else { + o.Resize({do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); + } + phi::funcs::Transpose trans; + trans(ctx, t, &o, axis); + dx->Resize(dx_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..2303db4ea57d6833cd70894e0dd0842e00585a8e --- /dev/null +++ b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PixelShuffleKernel(const Context& ctx, + const DenseTensor& x, + int upscale_factor, + const std::string& data_format, + DenseTensor* out) { + auto* in = &x; + ctx.template Alloc(out); + int factor = upscale_factor; + bool channel_last = (data_format == "NHWC"); + auto in_dims = in->dims(); + auto o_dims = out->dims(); + + DenseTensor t(*in); + if (!channel_last) { + t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); + } else { + t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor}); + } + std::vector axis = {0, 1, 4, 2, 5, 3}; + + DenseTensor o(*out); + if (!channel_last) { + o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); + } else { + o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); + } + phi::funcs::Transpose trans; + trans(ctx, t, &o, axis); + out->Resize(o_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/impl/size_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..7b781dba3ad2365de3c0f6ba52a746243300e573 --- /dev/null +++ b/paddle/phi/kernels/impl/size_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void SizeKernel(const Context& ctx, + const DenseTensor& input, + DenseTensor* out) { + auto place = ctx.GetPlace(); + auto out_data = ctx.template Alloc(out); + auto cpu_place = phi::CPUPlace(); + if (place == cpu_place) { + out_data[0] = input.numel(); + } else { + DenseTensor cpu_tensor; + cpu_tensor.Resize(out->dims()); + auto cpu_data = ctx.template HostAlloc(&cpu_tensor); + cpu_data[0] = input.numel(); + phi::Copy(ctx, cpu_tensor, place, false, out); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h b/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..915bf16a92df183a1f6321361bfc0c5c7fc394b1 --- /dev/null +++ b/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void SoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = x_grad->dims().size(); + const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = x_grad->dims()[calc_axis]; + + // allocate memory on device. + dev_ctx.template Alloc(x_grad); + if (x_grad->numel() == 0) { + return; + } + + const int n = phi::funcs::SizeToAxis(calc_axis, x_grad->dims()); + const int d = phi::funcs::SizeFromAxis(calc_axis, x_grad->dims()); + DenseTensor dX_2d, Out_2d, dOut_2d; + dX_2d.ShareDataWith(*x_grad).Resize({n, d}); + Out_2d.ShareDataWith(out).Resize({n, d}); + dOut_2d.ShareDataWith(out_grad).Resize({n, d}); + + paddle::operators::math::SoftmaxGradFunctor()( + dev_ctx, axis_dim, &Out_2d, &dOut_2d, &dX_2d); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6552f6ed581f45008f01c02fad3c007bf3664942 --- /dev/null +++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void SoftmaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = x.dims()[calc_axis]; + + // allocate memory on device. 
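+  // The actual computation is delegated to SoftmaxFunctor on a [n, d] view of
+  // x and out, where n folds every dimension before the canonical axis and d
+  // folds the axis together with the trailing dimensions; axis_dim tells the
+  // functor the softmax length inside d.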
+ dev_ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + + const int n = phi::funcs::SizeToAxis(calc_axis, x.dims()); + const int d = phi::funcs::SizeFromAxis(calc_axis, x.dims()); + DenseTensor X_2d, Out_2d; + X_2d.ShareDataWith(x).Resize({n, d}); + Out_2d.ShareDataWith(*out).Resize({n, d}); + paddle::operators::math::SoftmaxFunctor()( + dev_ctx, axis_dim, &X_2d, &Out_2d); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h index 5263f92cb578b9cda612e7bfa4edb2b425876b20..b0878d779462a9c351caa038af2ac017bbf4a14f 100644 --- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h @@ -21,7 +21,7 @@ #include -#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -130,7 +130,7 @@ void TraceGradKernel(const Context& ctx, const auto* input_arr = input_stride.Get(); #endif - paddle::platform::ForRange for_range(ctx, in_grad->numel()); + phi::funcs::ForRange for_range(ctx, in_grad->numel()); TraceGradFunctor functor(out_data, output_arr, input_arr, diff --git a/paddle/phi/kernels/impl/unbind_kernel_impl.h b/paddle/phi/kernels/impl/unbind_kernel_impl.h index 8a1342559bd908bf197e4949ce66f9b3e504b499..3e233a2038e48098d8c78bf81d922a812a87187a 100644 --- a/paddle/phi/kernels/impl/unbind_kernel_impl.h +++ b/paddle/phi/kernels/impl/unbind_kernel_impl.h @@ -20,7 +20,7 @@ namespace phi { template -void UnbindKernel(const Context& ctx, +void UnbindKernel(const Context& dev_ctx, const DenseTensor& x, int axis, std::vector outs) { @@ -29,12 +29,12 @@ void UnbindKernel(const Context& ctx, std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { - ctx.template Alloc(outs[j]); + dev_ctx.template Alloc(outs[j]); shape_refer.emplace_back(outs[j]); } phi::funcs::SplitFunctor functor; - functor(ctx, x, shape_refer, axis, &outs); + functor(dev_ctx, x, shape_refer, axis, &outs); } } // namespace phi diff --git a/paddle/phi/kernels/increment_kernel.h b/paddle/phi/kernels/increment_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7c5bc2a20279106905edca0209f325cf2c7f1e78 --- /dev/null +++ b/paddle/phi/kernels/increment_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IncrementKernel(const Context& ctx, + const DenseTensor& x, + float value, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/masked_select_grad_kernel.h b/paddle/phi/kernels/masked_select_grad_kernel.h index fd16091a665ca983cd5185eb7e12a2928052794a..f9db1fcd2acc7a7924d9b9e393550a74d0d0ac81 100644 --- a/paddle/phi/kernels/masked_select_grad_kernel.h +++ b/paddle/phi/kernels/masked_select_grad_kernel.h @@ -24,4 +24,4 @@ void MaskedSelectGradKernel(const Context& dev_ctx, const DenseTensor& mask, DenseTensor* x_grad); -} // namspace pten +} // namespace phi diff --git a/paddle/phi/kernels/masked_select_kernel.h b/paddle/phi/kernels/masked_select_kernel.h index abd3c318986d81cb14c0f8ecdd449faf1b48cf3a..471f650690d367da132e0ad2e8da441394b7aff2 100644 --- a/paddle/phi/kernels/masked_select_kernel.h +++ b/paddle/phi/kernels/masked_select_kernel.h @@ -23,4 +23,4 @@ void MaskedSelectKernel(const Context& dev_ctx, const DenseTensor& mask, DenseTensor* out); -} // namspace pten +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index db6c5e1ac35919c153c8021c82e747cc3ca9fe37..3cb7b66ddf73e5fa3c5502a4acaad2c277a22ac6 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -121,7 +121,8 @@ PD_REGISTER_KERNEL(subtract, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide, CPU, ALL_LAYOUT, @@ -142,7 +143,8 @@ PD_REGISTER_KERNEL(multiply, int64_t, bool, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(mean, @@ -180,6 +182,7 @@ PD_REGISTER_KERNEL(add, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(subtract, diff --git a/paddle/phi/kernels/multinomial_kernel.h b/paddle/phi/kernels/multinomial_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f3d8770bc1b606cb13e23256b1d9dabd7f172df0 --- /dev/null +++ b/paddle/phi/kernels/multinomial_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/mv_grad_kernel.h b/paddle/phi/kernels/mv_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..edc73d89367ff9d074bbbaef5af38f8222e57de9 --- /dev/null +++ b/paddle/phi/kernels/mv_grad_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// dX = | dOut vec^T +// dVec = | X^T dOut +template +void MvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& vec, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* vec_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mv_kernel.h b/paddle/phi/kernels/mv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ab4f0b82794ab3ba154c072496cb897fc1416b84 --- /dev/null +++ b/paddle/phi/kernels/mv_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& vec, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/pixel_shuffle_grad_kernel.h b/paddle/phi/kernels/pixel_shuffle_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..be57de5da40530e8aa96aff23638e8f6613b5f0a --- /dev/null +++ b/paddle/phi/kernels/pixel_shuffle_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
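+
+// PixelShuffle rearranges a tensor of shape [N, C * r * r, H, W] (or its NHWC
+// counterpart) into [N, C, H * r, W * r] for an upscale factor r; the gradient
+// kernel declared below applies the inverse rearrangement to out_grad.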
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PixelShuffleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int upscale_factor, + const std::string& data_format, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/pixel_shuffle_kernel.h b/paddle/phi/kernels/pixel_shuffle_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..18b9ab9c21fdc7fbccbfe8d15152e09006a34e37 --- /dev/null +++ b/paddle/phi/kernels/pixel_shuffle_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PixelShuffleKernel(const Context& ctx, + const DenseTensor& x, + int upscale_factor, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/randint_kernel.h b/paddle/phi/kernels/randint_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bfefc628614fbc03c484a43f31a7194da15a2bf9 --- /dev/null +++ b/paddle/phi/kernels/randint_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RandintKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out); + +template +void RandintRawKernel(const Context& dev_ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/randperm_kernel.h b/paddle/phi/kernels/randperm_kernel.h index 63bdac6da6fdc12955e3743f23941840696a0ce6..70b95db98bef95f364802ad14a54966dc47d13fe 100644 --- a/paddle/phi/kernels/randperm_kernel.h +++ b/paddle/phi/kernels/randperm_kernel.h @@ -20,7 +20,11 @@ namespace phi { template -void RandpermKernel(const Context& ctx, +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out); + +template +void RandpermKernel(const Context& dev_ctx, int n, DataType dtype, DenseTensor* out); diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index 5361315bb611b02375da52e7bbe00e1f7ee3f4ed..38132966407dce23a3665d22708820df016deb9c 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -24,7 +24,7 @@ void ReshapeGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, DenseTensor* x_grad) { auto x_dims = x_grad->dims(); - phi::Copy(dev_ctx, out_grad, false, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); x_grad->Resize(x_dims); } diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 570e70ea11227111a7343003d8043a3407841f19..f758d7c70518f067188242fdc9f014b5b414e885 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -36,7 +36,7 @@ void ReshapeKernel(const Context& dev_ctx, // TODO(chenweihang): the output dims are overwrite after copying, // here we need to use copy method that only copy data auto dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(dims); out->ResetLoD(x.lod()); } diff --git a/paddle/phi/kernels/size_kernel.h b/paddle/phi/kernels/size_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2d7a29104db0813f4d4dca340575d0c1a5885d4c --- /dev/null +++ b/paddle/phi/kernels/size_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
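The reshape hunks above reflect the updated phi::Copy signature, which now takes the destination place as an explicit argument instead of reading it from the destination tensor. A call site therefore follows this pattern (a sketch mirroring the hunks above, shown only for illustration):

// Copy out_grad into x_grad on the current device, without blocking.
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);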
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SizeKernel(const Context& ctx, const DenseTensor& input, DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/softmax_grad_kernel.h b/paddle/phi/kernels/softmax_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ecf65c1f17c789b028b3a0a8ad270cca7aa69d9
--- /dev/null
+++ b/paddle/phi/kernels/softmax_grad_kernel.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SoftmaxGradKernel(const Context& dev_ctx,
+                       const DenseTensor& out,
+                       const DenseTensor& out_grad,
+                       int axis,
+                       DenseTensor* x_grad);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/softmax_kernel.h b/paddle/phi/kernels/softmax_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca69d652770aacd01191f5c3ca685276f0f2336f
--- /dev/null
+++ b/paddle/phi/kernels/softmax_kernel.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SoftmaxRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      int axis,
+                      DenseTensor* out);
+
+template <typename T, typename Context>
+void SoftmaxKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   int axis,
+                   DataType dtype,
+                   DenseTensor* out) {
+  auto cast_x = phi::Cast<T, Context>(dev_ctx, x, dtype);
+  phi::SoftmaxRawKernel<T, Context>(dev_ctx, cast_x, axis, out);
+}
+
+} // namespace phi
diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..71160a6365dc778e40476af960f21443cac698e5
--- /dev/null
+++ b/paddle/phi/kernels/sparse/convolution_kernel.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +struct Dims4D { + int dims[4]; + Dims4D(const int batch, const int x, const int y, const int z) { + dims[0] = batch; + dims[1] = z; + dims[2] = y; + dims[3] = x; + } + HOSTDEVICE const int& operator[](int i) const { return dims[i]; } +}; + +// Judge whether the current position x is in (lower, upper) +inline HOSTDEVICE bool Check(const int& x, + const int& kx, + const int& pad, + const int& stride, + const int dilation, + const int kdim, + const int xdim) { + const int lower = x - dilation * kx + pad; + const int uper = x + (kdim - kx - 1) * dilation - pad; + return (lower >= 0 && lower % stride == 0 && uper < xdim); +} + +// Check whether the current position(x, y, z) is legal: +// Judge the minimum and maximum values at each latitude +inline HOSTDEVICE bool Check(const Dims4D& dims, + const Dims4D& kernel_dims, + const Dims4D& paddings, + const Dims4D& dilations, + const Dims4D& strides, + const int x, + const int y, + const int z, + const int kx, + const int ky, + const int kz) { + bool x_valid = Check( + x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); + bool y_valid = Check( + y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); + bool z_valid = Check( + z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); + return (x_valid && y_valid && z_valid); +} + +template +inline HOSTDEVICE int PointToIndex(const int& batch, + const int& x, + const int& y, + const int& z, + const Dim& dims) { + return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + + y * dims[3] + x; +} + +template +inline HOSTDEVICE void IndexToPoint( + const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { + int n = index; + *x = n % dims[3]; + n /= dims[3]; + *y = n % dims[2]; + n /= dims[2]; + *z = n % dims[1]; + n /= dims[1]; + *batch = n; +} + +inline void GetOutShape(const DDim& x_dims, + const DDim& kernel_dims, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DDim* out_dims) { + PADDLE_ENFORCE_EQ( + x_dims.size(), + 5, + phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); + PADDLE_ENFORCE_EQ(kernel_dims.size(), + 5, + phi::errors::InvalidArgument( + "the shape of kernel should be (D, H, W, C, OC)")); + + // infer out shape + (*out_dims)[0] = x_dims[0]; + (*out_dims)[4] = kernel_dims[4]; + for (int i = 1; i < 4; i++) { + (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - + dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / + strides[i - 1] + + 1; + } +} + +template +void Conv3dKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + SparseCooTensor* out, + DenseTensor* rulebook); + +template +SparseCooTensor Conv3d(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* rulebook) { + DenseTensor indices = phi::Empty(dev_ctx); + DenseTensor values = phi::Empty(dev_ctx); + SparseCooTensor coo(indices, values, x.dims()); + Conv3dKernel( + dev_ctx, x, kernel, paddings, 
dilations, strides, groups, &coo, rulebook); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..5803069d927d70947d8bc7c3d6af051d7ea1b81c --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -0,0 +1,181 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace sparse { + +// such as: kernel(3, 3, 3), kernel_size = 27 +// counter_per_weight: (kernel_size) +// TODO(zhangkaihuo): optimize performance with multithreading +template +void ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel) { + const auto& kernel_dims = kernel.dims(); + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + dev_ctx.Alloc(counter_per_kernel, + counter_per_kernel->dtype(), + sizeof(int) * counter_per_kernel->numel()); + int* counter_ptr = counter_per_kernel->data(); + int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + memset(counter_ptr, 0, kernel_size * sizeof(int)); + + int rulebook_len = 0; + // calc the rulebook_len + const auto& x_dims = x.dims(); + const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + const Dims4D c_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]); + const Dims4D c_strides(1, strides[2], strides[1], strides[0]); + const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]); + + auto f_calc_rulebook = [&](int* rulebook_ptr) { + int kernel_index = 0, rulebook_index = 0; + for (int kz = 0; kz < kernel_dims[0]; kz++) { + for (int ky = 0; ky < kernel_dims[1]; ky++) { + for (int kx = 0; kx < kernel_dims[2]; kx++) { + for (int64_t i = 0; i < non_zero_num; i++) { + int batch = indices_ptr[i]; + int in_z = indices_ptr[i + non_zero_num]; + int in_y = indices_ptr[i + 2 * non_zero_num]; + int in_x = indices_ptr[i + 3 * non_zero_num]; + int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0]; + int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1]; + int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2]; + if (Check(c_x_dims, + c_kernel_dims, + 
c_paddings, + c_dilations, + c_strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + if (rulebook_ptr == nullptr) { + counter_ptr[kernel_index] += 1; + ++rulebook_len; + } else { + rulebook_ptr[rulebook_index] = kernel_index; + rulebook_ptr[rulebook_index + rulebook_len] = i; // in_i + rulebook_ptr[rulebook_index + rulebook_len * 2] = + PointToIndex( + batch, out_x, out_y, out_z, out_dims); // out_index + ++rulebook_index; + } + } + } + ++kernel_index; + } + } + } + }; + + f_calc_rulebook(nullptr); + // alloc the rulebook + rulebook->ResizeAndAllocate({3, rulebook_len}); + dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); + int* rulebook_ptr = rulebook->data(); + f_calc_rulebook(rulebook_ptr); +} + +template +void UpdateRulebookAndOutIndex(const Context& dev_ctx, + const SparseCooTensor& x, + const int kernel_size, + const int out_channels, + const DDim& out_dims, + DenseTensor* rulebook, + SparseCooTensor* out) { + std::set out_indexs; + int n = rulebook->dims()[1]; + int* rulebook_ptr = rulebook->data(); + for (int i = 0; i < n; i++) { + out_indexs.insert(rulebook_ptr[i + n * 2]); + } + + int out_non_zero_num = out_indexs.size(); + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, out_channels}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + dev_ctx.Alloc( + &out_indices, out_indices.dtype(), out_indices.numel() * sizeof(int)); + int* out_indices_ptr = out_indices.data(); + int i = 0; + for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) { + const int index = *it; + int batch, x, y, z; + IndexToPoint(index, out_dims, &batch, &x, &y, &z); + out_indices_ptr[i] = batch; + out_indices_ptr[i + out_non_zero_num] = z; + out_indices_ptr[i + out_non_zero_num * 2] = y; + out_indices_ptr[i + out_non_zero_num * 3] = x; + } + for (i = 0; i < n; i++) { + int out_index = rulebook_ptr[i + n * 2]; + rulebook_ptr[i + n * 2] = + std::distance(out_indexs.begin(), out_indexs.find(out_index)); + } + + out->SetMember(out_indices, out_values, out_dims, true); +} + +template +void Gather( + const T* x, const int* indexs, const int n, const int channels, T* out) { + for (int i = 0; i < n; i++) { + int real_i = indexs[i]; + memcpy(out + i * channels, x + real_i * channels, channels * sizeof(T)); + } +} + +template +void Scatter( + const T* x, const int* indexs, const int n, const int channels, T* out) { + for (int i = 0; i < n; i++) { + int real_i = indexs[i]; + for (int j = 0; j < channels; j++) { + out[real_i * channels + j] += x[i * channels + j]; + } + } +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fdf255bd542e66245b44b2ec906dc207ee51a422 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
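As a worked example of the shape and index conventions used in the sparse convolution code above (illustrative only; the helper below is not part of the patch): GetOutShape applies the usual convolution size formula per spatial dimension, and PointToIndex/IndexToPoint flatten a (batch, z, y, x) coordinate against dims ordered as {N, D, H, W}.

// Per-dimension output size, matching the formula in GetOutShape:
//   e.g. in = 8, kernel = 3, pad = 1, dilation = 1, stride = 2
//   -> (8 + 2*1 - 1*(3 - 1) - 1) / 2 + 1 = 4
int ConvOutSize(int in, int kernel, int pad, int dilation, int stride) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

// Flattened index used by PointToIndex/IndexToPoint, with dims = {N, D, H, W}:
//   index = ((batch * D + z) * H + y) * W + x
// IndexToPoint inverts this by repeated modulo/divide, as in the header above.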
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void Conv3dKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + SparseCooTensor* out, + DenseTensor* rulebook) { + // update padding and dilation + // Currently, only support x.layout is NDHWC, groups = 1 + // if x.layout != NDHWC then transpose(x), transpose(weight) + + const auto& x_dims = x.dims(); + const auto& kernel_dims = kernel.dims(); + int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + DDim out_dims = {1, 1, 1, 1, 1}; + GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + + // Second algorithm: + // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf + // 1. product rulebook + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + // DenseTensor rulebook = phi::Empty(dev_ctx); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + + ProductRuleBook(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + out_dims, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out); + + int n = rulebook->dims()[1]; + const int* counter_ptr = counter_per_kernel.data(); + + // 2. gather + DenseTensorMeta in_features_meta( + x.dtype(), {n, in_channels}, DataLayout::NHWC); + DenseTensorMeta out_features_meta( + x.dtype(), {n, out_channels}, DataLayout::NHWC); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor out_features = + phi::Empty(dev_ctx, std::move(out_features_meta)); + dev_ctx.Alloc(&in_features, x.dtype(), sizeof(T) * in_features.numel()); + dev_ctx.Alloc(&out_features, x.dtype(), sizeof(T) * out_features.numel()); + T* in_features_ptr = in_features.data(); + T* out_features_ptr = out_features.data(); + + Gather(x.non_zero_elements().data(), + rulebook->data() + n, + n, + in_channels, + in_features_ptr); + + // 3. 
call gemm for every werght + auto blas = phi::funcs::GetBlas(dev_ctx); + std::vector offsets(kernel_size + 1); + int offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter_ptr[i]; + } + offsets[kernel_size] = offset; + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (counter_ptr[i] <= 0) { + continue; + } + + // call gemm: (n, in_channels) * (in_channels, out_channels) + const int M = counter_ptr[i]; + const int K = in_channels; // in_channels + const int N = out_channels; // out_channels + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * K * N; + T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + blas.GEMM(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_out_ptr); + } + + // 4. scatter + dev_ctx.Alloc(out->mutable_non_zero_elements(), + out->mutable_non_zero_elements()->dtype(), + sizeof(T) * in_features.numel()); + T* out_values_ptr = out->mutable_non_zero_elements()->data(); + memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels); + Scatter(out_features_ptr, + rulebook->data() + n * 2, + n, + out_channels, + out_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL( + sparse_conv3d, CPU, ALL_LAYOUT, phi::sparse::Conv3dKernel, float, double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 1e2c70a9cf39bf0df738a74b301afcc0fcbd8699..2e741111fb1489aef5bdc51de637b77eec9d28a7 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -86,19 +86,6 @@ __global__ void GetNonZeroElementsAndIndices(const T* dense_data, } } -template -void GetGpuLaunchConfig1D(const Context& dev_ctx, - const int64_t n, - int* grid_size, - int* block_size) { - const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - *block_size = (n >= MAX_BLOCK_DIM) ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(n))); - *grid_size = n / *block_size; - *grid_size = (*grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : *grid_size; -} - template void DenseToSparseCooKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index b5201e16f548d594af47aa9a4611d35f9cf2ad4f..d96d134a26b08a0208122a7ea9a62ce07c033d51 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -40,6 +40,19 @@ inline const DDim InferDenseDims(const DDim& x_dims, return values_dims; } +template +inline void GetGpuLaunchConfig1D(const Context& dev_ctx, + const int64_t n, + int* grid_size, + int* block_size) { + const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + *block_size = (n >= MAX_BLOCK_DIM) ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(n))); + *grid_size = n / *block_size; + *grid_size = (*grid_size >= MAX_GRID_DIM) ? 
MAX_GRID_DIM : *grid_size; +} + template void DenseToSparseCooKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index c981ca115850707857ed1f25a9e546138d9d950c..60df877355b8268efafddfdc2b452617cdadf9df 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -69,7 +69,7 @@ void TransferLayoutKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL(pten_transfer_layout, +PD_REGISTER_GENERAL_KERNEL(phi_transfer_layout, CPU, ALL_LAYOUT, phi::TransferLayoutKernel, diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0370cc431fef9cab69861b7f707f65c897e20fa2 --- /dev/null +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -0,0 +1,168 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e +template +T Erfinv(T x) { + if (x < -1 || x > 1) { + return std::numeric_limits::quiet_NaN(); + } else if (x == 1.0) { + return std::numeric_limits::infinity(); + } else if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + + const T LN2 = 6.931471805599453094172321214581e-1; + + const T A0 = 1.1975323115670912564578e0; + const T A1 = 4.7072688112383978012285e1; + const T A2 = 6.9706266534389598238465e2; + const T A3 = 4.8548868893843886794648e3; + const T A4 = 1.6235862515167575384252e4; + const T A5 = 2.3782041382114385731252e4; + const T A6 = 1.1819493347062294404278e4; + const T A7 = 8.8709406962545514830200e2; + + const T B0 = 1.0000000000000000000e0; + const T B1 = 4.2313330701600911252e1; + const T B2 = 6.8718700749205790830e2; + const T B3 = 5.3941960214247511077e3; + const T B4 = 2.1213794301586595867e4; + const T B5 = 3.9307895800092710610e4; + const T B6 = 2.8729085735721942674e4; + const T B7 = 5.2264952788528545610e3; + + const T C0 = 1.42343711074968357734e0; + const T C1 = 4.63033784615654529590e0; + const T C2 = 5.76949722146069140550e0; + const T C3 = 3.64784832476320460504e0; + const T C4 = 1.27045825245236838258e0; + const T C5 = 2.41780725177450611770e-1; + const T C6 = 2.27238449892691845833e-2; + const T C7 = 7.74545014278341407640e-4; + + const T D0 = 1.4142135623730950488016887e0; + const T D1 = 2.9036514445419946173133295e0; + const T D2 = 2.3707661626024532365971225e0; + const T D3 = 9.7547832001787427186894837e-1; + const T D4 = 2.0945065210512749128288442e-1; + const T D5 = 2.1494160384252876777097297e-2; + const T D6 = 7.7441459065157709165577218e-4; + const T D7 = 1.4859850019840355905497876e-9; + + 
const T E0 = 6.65790464350110377720e0; + const T E1 = 5.46378491116411436990e0; + const T E2 = 1.78482653991729133580e0; + const T E3 = 2.96560571828504891230e-1; + const T E4 = 2.65321895265761230930e-2; + const T E5 = 1.24266094738807843860e-3; + const T E6 = 2.71155556874348757815e-5; + const T E7 = 2.01033439929228813265e-7; + + const T F0 = 1.414213562373095048801689e0; + const T F1 = 8.482908416595164588112026e-1; + const T F2 = 1.936480946950659106176712e-1; + const T F3 = 2.103693768272068968719679e-2; + const T F4 = 1.112800997078859844711555e-3; + const T F5 = 2.611088405080593625138020e-5; + const T F6 = 2.010321207683943062279931e-7; + const T F7 = 2.891024605872965461538222e-15; + + T abs_x = abs(x); + + if (abs_x <= 0.85) { + T r = 0.180625 - 0.25 * x * x; + T num = + (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) * + r + + A0); + T den = + (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) * + r + + B0); + return x * num / den; + } + + T r = sqrt(LN2 - log(1.0 - abs_x)); + + T num, den; + if (r <= 5.0) { + r = r - 1.6; + num = + (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) * + r + + C0); + den = + (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) * + r + + D0); + } else { + r = r - 5.0; + num = + (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) * + r + + E0); + den = + (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) * + r + + F0); + } + + if (x < 0) { + return -num / den; + } else { + return num / den; + } +} + +template +struct TruncatedNormal { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + + T operator()(T value) const { + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + } +}; + +template +void TruncatedGaussianRandomKernel(const Context& ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/xpu/copy_kernel.cc b/paddle/phi/kernels/xpu/copy_kernel.cc index 58efbafc88bee0933a364ff9872604c94174305f..fb931ef18a85668ce49d02dc9730cbf3b1436113 100644 --- a/paddle/phi/kernels/xpu/copy_kernel.cc +++ b/paddle/phi/kernels/xpu/copy_kernel.cc @@ -27,12 +27,19 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + void* dst_ptr = nullptr; + + dst->Resize(src.dims()); + if (paddle::platform::is_cpu_place(dst_place)) { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } const auto& src_place = src.place(); - const auto& dst_place = dst->place(); if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " @@ -43,7 +50,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->ResizeAndAllocate(src.dims()); + CHECK(dst->layout() == src.layout()); auto size = src.numel() * paddle::experimental::SizeOf(src.dtype()); diff --git a/paddle/phi/ops/compat/addmm_sig.cc b/paddle/phi/ops/compat/addmm_sig.cc new 
file mode 100644 index 0000000000000000000000000000000000000000..b3bc0bb23a71e25aafe1c2e5038a60fdcf865a12 --- /dev/null +++ b/paddle/phi/ops/compat/addmm_sig.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "addmm_grad", + {"Input", "X", "Y", GradVarName("Out")}, + {"Alpha", "Beta"}, + {GradVarName("Input"), GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(addmm_grad, phi::AddmmGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..570bf7ce943d6de8693639bacf50c5883b2ec4e2 --- /dev/null +++ b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BilinearTensorProductOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}, {}, {"Out"}); +} + +KernelSignature BilinearTensorProductGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bilinear_tensor_product_grad", + {"X", "Y", "Weight", GradVarName("Out")}, + {}, + {GradVarName("X"), + GradVarName("Y"), + GradVarName("Weight"), + GradVarName("Bias")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product, + phi::BilinearTensorProductOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product_grad, + phi::BilinearTensorProductGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cholesky_sig.cc b/paddle/phi/ops/compat/cholesky_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c7ca75704669bf3af3c3b698deb8f61a6501693 --- /dev/null +++ b/paddle/phi/ops/compat/cholesky_sig.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature CholeskyGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cholesky_grad", + {"Out", GradVarName("Out")}, + {"upper"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(cholesky_grad, phi::CholeskyGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a14b9095c8343f47e1d6aa039c9aced963984ce --- /dev/null +++ b/paddle/phi/ops/compat/diag_sig.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); + +PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); diff --git a/paddle/phi/ops/compat/erfinv_sig.cc b/paddle/phi/ops/compat/erfinv_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..490573191533f506bce082b264a9cf0520125d67 --- /dev/null +++ b/paddle/phi/ops/compat/erfinv_sig.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ErfinvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "erfinv_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(erfinv_grad, phi::ErfinvGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/eye_sig.cc b/paddle/phi/ops/compat/eye_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..6dafb642795d116ad85e4544a0b8c4a8496d6291 --- /dev/null +++ b/paddle/phi/ops/compat/eye_sig.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature EyeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "eye", {}, {"num_rows", "num_columns", "dtype"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(eye, phi::EyeOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gumbel_softmax_sig.cc b/paddle/phi/ops/compat/gumbel_softmax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7585a4e5f39acc2d7793526f6a5ca7948c370f3 --- /dev/null +++ b/paddle/phi/ops/compat/gumbel_softmax_sig.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GumbelSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("gumbel_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax_grad, + phi::GumbelSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/mv_sig.cc b/paddle/phi/ops/compat/mv_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..ab0d31ee31dab99125a28d5b9f662d25d8e408d0 --- /dev/null +++ b/paddle/phi/ops/compat/mv_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
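The *_sig.cc files above all follow one pattern: an argument-mapping function that lists the fluid op's input, attribute, and output names (with GradVarName("X") denoting the gradient variable of X) in the order the phi kernel expects, plus a PD_REGISTER_ARG_MAPPING_FN call. A hypothetical mapping for a made-up unary op "foo" (not part of this patch, shown only to make the pattern explicit) would read:

// Assumes the same op_utils.h include and phi namespace as the files above.
KernelSignature FooGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature(
      "foo_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")});
}

PD_REGISTER_ARG_MAPPING_FN(foo_grad, phi::FooGradOpArgumentMapping);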
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MvOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mv", {"X", "Vec"}, {}, {"Out"}); +} + +KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mv_grad", + {"X", "Vec", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Vec")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mv, phi::MvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mv_grad, phi::MvGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/pixel_shuffle_sig.cc b/paddle/phi/ops/compat/pixel_shuffle_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..641288cf12ae2e44147f6bd35434a6661727e9cd --- /dev/null +++ b/paddle/phi/ops/compat/pixel_shuffle_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PixelShuffleOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "pixel_shuffle", {"X"}, {"upscale_factor", "data_format"}, {"Out"}); +} + +KernelSignature PixelShuffleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pixel_shuffle_grad", + {GradVarName("Out")}, + {"upscale_factor", "data_format"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle, phi::PixelShuffleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle_grad, + phi::PixelShuffleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/randint_sig.cc b/paddle/phi/ops/compat/randint_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb6da78a258bc415b54fd128655bae422b3b711c --- /dev/null +++ b/paddle/phi/ops/compat/randint_sig.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RandintOpArgumentMapping(const ArgumentMappingContext& ctx) { + int seed = paddle::any_cast(ctx.Attr("seed")); + if (seed) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "randint_raw", + {}, + {"low", "high", "ShapeTensorList", "seed", "dtype"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("randint_raw", + {}, + {"low", "high", "ShapeTensor", "seed", "dtype"}, + {"Out"}); + } else { + return KernelSignature("randint_raw", + {}, + {"low", "high", "shape", "seed", "dtype"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "randint", {}, {"low", "high", "ShapeTensorList", "dtype"}, {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature( + "randint", {}, {"low", "high", "ShapeTensor", "dtype"}, {"Out"}); + } else { + return KernelSignature( + "randint", {}, {"low", "high", "shape", "dtype"}, {"Out"}); + } + } + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(randint, phi::RandintOpArgumentMapping); diff --git a/paddle/phi/ops/compat/randperm_sig.cc b/paddle/phi/ops/compat/randperm_sig.cc index 14b28512e402a377b4f2f8f7d8f1e90f7ef37b71..89548beff6762a59edbe317383714502b4382efe 100644 --- a/paddle/phi/ops/compat/randperm_sig.cc +++ b/paddle/phi/ops/compat/randperm_sig.cc @@ -17,7 +17,12 @@ namespace phi { KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); + int seed = paddle::any_cast(ctx.Attr("seed")); + if (seed) { + return KernelSignature("randperm", {}, {"n", "dtype", "seed"}, {"Out"}); + } else { + return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); + } } } // namespace phi diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index 915ea4ce302aea6a4a11f1c0745229fb09c1d8c8..95deb007d99d9c42bbc2cc22faed2a44fa58b0f5 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -20,7 +20,7 @@ namespace phi { * Note [ Why does the ArgumentMapping function need to be so complicated? ] * * In order to meet the requirements of infrt, the function used to match Op - * and Kernel parameters, need to be placed in pten as a compatible component, + * and Kernel parameters, need to be placed in phi as a compatible component, * and does not depend on fluid. * * Because infrt not only needs to dynamically call this argument mapping diff --git a/paddle/phi/ops/compat/softmax_sig.cc b/paddle/phi/ops/compat/softmax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..65a915b51d08a85acf16d4206faa765dc6434d8c --- /dev/null +++ b/paddle/phi/ops/compat/softmax_sig.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("softmax", {"X"}, {"axis"}, {"Out"}); +} + +KernelSignature SoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(softmax, phi::SoftmaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softmax_grad, phi::SoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/truncated_gaussian_random_sig.cc b/paddle/phi/ops/compat/truncated_gaussian_random_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c4d47f8c7221f0447159aef34812d92b4b80ff6 --- /dev/null +++ b/paddle/phi/ops/compat/truncated_gaussian_random_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TruncatedGaussianRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("truncated_gaussian_random", + {}, + {"shape", "mean", "std", "seed", "dtype"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(truncated_gaussian_random, + phi::TruncatedGaussianRandomOpArgumentMapping); diff --git a/paddle/phi/tests/CMakeLists.txt b/paddle/phi/tests/CMakeLists.txt index ab5da613199be8d021392f9f5db6d8eed38c30b7..3bc13e55eb8a21edb87758c55e0f3d19475d4850 100644 --- a/paddle/phi/tests/CMakeLists.txt +++ b/paddle/phi/tests/CMakeLists.txt @@ -2,4 +2,4 @@ add_subdirectory(api) add_subdirectory(common) add_subdirectory(core) add_subdirectory(kernels) -add_subdirectory(ops_signature) +add_subdirectory(ops) diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index d875dbd4444ae664472663caa0ea5b2694ca8e4f..cde085423e482e62a280815700ead9a0b6c64262 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -1,27 +1,27 @@ if(WITH_ROCM) - hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api manual_api glog) + hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api glog) else() - cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api manual_api glog) + cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api glog) endif() -cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) +cc_test(test_phi_exception SRCS test_pten_exception.cc DEPS gtest) -cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS 
pten_tensor pten_api pten_api_utils) -cc_test(test_empty_api SRCS test_empty_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_cast_api SRCS test_cast_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_conj_api SRCS test_conj_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_concat_api SRCS test_concat_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_split_api SRCS test_split_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_data_transform SRCS test_data_transform.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS pten_tensor pten_api pten_api_utils) +cc_test(test_mean_api SRCS test_mean_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_dot_api SRCS test_dot_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_empty_api SRCS test_empty_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_cast_api SRCS test_cast_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_to_api SRCS test_to_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_slice_api SRCS test_slice_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_sum_api SRCS test_sum_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_conj_api SRCS test_conj_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils) diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index b6ca081e9786650bf9a446e4a47e00c9e7a36edb..d93f00129b9a14170b979dfd23eb6e292e996ce8 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -42,7 +42,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, kernel_layout == DataLayout::UNDEFINED || 
kernel_data_type == DataType::UNDEFINED) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == Backend::UNDEFINED) { kernel_backend = kernel_key.backend(); } @@ -71,7 +71,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(kernel_backend)), + phi::TransToPhiPlace(kernel_backend)), phi::DenseTensorMeta()); phi::MetaTensor meta_out(dense_out.get()); phi::UnchangedInferMeta(*dense_x, &meta_out); @@ -215,7 +215,7 @@ Tensor scale_switch_case(const Tensor& x, kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == Backend::UNDEFINED) { kernel_backend = kernel_key.backend(); } @@ -238,7 +238,7 @@ Tensor scale_switch_case(const Tensor& x, auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(kernel_backend)), + phi::TransToPhiPlace(kernel_backend)), phi::DenseTensorMeta()); phi::MetaTensor meta_out(dense_out.get()); phi::UnchangedInferMeta(*dense_x, &meta_out); diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc index 3df1866efb0df69fcccf9f18a13235940db2b35b..a3c497bd427ae040b33dce241a70ecaafee5fbcc 100644 --- a/paddle/phi/tests/api/test_data_transform.cc +++ b/paddle/phi/tests/api/test_data_transform.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" @@ -84,7 +83,7 @@ TEST(Tensor, data_transform_diff_place) { ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); ASSERT_EQ(out.initialized(), true); ASSERT_EQ(out.impl()->place(), - phi::TransToPtenPlace(experimental::Backend::GPU)); + phi::TransToPhiPlace(experimental::Backend::GPU)); auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true); diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index fd8a127b7c77d922b44acb8c7a72cd3da9941321..2a3dd9c7dff62071fcd7dcf18cddcc5946ff7480 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/phi/api/backward/backward_api.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -127,8 +128,8 @@ TEST(API, matmul_cuda) { auto place = paddle::platform::CUDAPlace(); auto* dev_ctx = static_cast(pool.GetByPlace(place)); - phi::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); - phi::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); + phi::Copy(*dev_ctx, *ref_x.get(), phi::GPUPlace(), false, dense_x.get()); + phi::Copy(*dev_ctx, *ref_y.get(), phi::GPUPlace(), false, dense_y.get()); paddle::experimental::Tensor x(dense_x); paddle::experimental::Tensor y(dense_y); @@ -152,7 +153,7 @@ TEST(API, matmul_cuda) { phi::DenseTensorMeta( phi::DataType::FLOAT32, out.dims(), phi::DataLayout::NCHW)); - phi::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get()); + phi::Copy(*dev_ctx, *dense_out.get(), phi::CPUPlace(), false, ref_out.get()); for (size_t i = 0; i < 9; i++) { ASSERT_NEAR(sum[i], ref_out->data()[i], 1e-6f); @@ -161,5 +162,31 @@ TEST(API, matmul_cuda) { #endif +TEST(API, matmul_double_grad) { + // 1. create tensor + auto x = paddle::experimental::full({3, 3}, 1.0); + auto y = paddle::experimental::full({3, 3}, 2.0); + auto out_grad = paddle::experimental::full({3, 3}, 2.0); + auto dx_grad = paddle::experimental::full({3, 3}, 2.0); + + // 2. test API + const auto out = paddle::experimental::matmul_double_grad( + x, y, out_grad, dx_grad, {}, false, false); + + // 3. check result + ASSERT_EQ(out.size(), 3UL); + ASSERT_EQ(out[0].size(), 1UL); + ASSERT_EQ(out[1].size(), 1UL); + ASSERT_EQ(out[2].size(), 1UL); + ASSERT_EQ(out[0][0].dims()[1], 3); + ASSERT_EQ(out[0][0].numel(), 9); + ASSERT_EQ(out[1][0].numel(), 9); + ASSERT_EQ(out[2][0].numel(), 9); + ASSERT_EQ(out[0][0].type(), phi::DataType::FLOAT32); + ASSERT_EQ(out[0][0].layout(), phi::DataLayout::NCHW); + ASSERT_EQ(out[1][0].initialized(), true); + ASSERT_EQ(out[2][0].initialized(), true); +} + } // namespace tests } // namespace paddle diff --git a/paddle/phi/tests/api/test_pten_tensor.cc b/paddle/phi/tests/api/test_pten_tensor.cc index de88561c4d675c137afb3fab664342f15de72c86..dc2883c1794e2c986ed5446981b749f5f4dd0bc2 100644 --- a/paddle/phi/tests/api/test_pten_tensor.cc +++ b/paddle/phi/tests/api/test_pten_tensor.cc @@ -211,7 +211,7 @@ void TestJudgeTensorType() { CHECK(test_tensor.is_dense_tensor() == true); } -TEST(PtenTensor, All) { +TEST(PhiTensor, All) { VLOG(2) << "TestCopy"; GroupTestCopy(); VLOG(2) << "TestDtype"; diff --git a/paddle/phi/tests/api/test_split_api.cc b/paddle/phi/tests/api/test_split_api.cc index 9b3478e85e04ca08ac0c3c687068ea53b20e662d..0b836a010586d775ced1e7c196b1b5139ac42fc1 100644 --- a/paddle/phi/tests/api/test_split_api.cc +++ b/paddle/phi/tests/api/test_split_api.cc @@ -17,7 +17,6 @@ #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc index c790e7bfa71da4820e793827984da83ddf59743c..d337a0b601a00d6ae0423b7184d4d2c5cc6ef2b8 100644 --- a/paddle/phi/tests/api/test_to_api.cc +++ b/paddle/phi/tests/api/test_to_api.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/phi/api/include/manual_api.h" +#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index f54b37cb976c54a784d25c438614afcc14300a81..710ea3c06647205171289812be6fa7f18a8fb8d0 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(pten_test_backend SRCS test_backend.cc DEPS gtest) -cc_test(pten_test_data_layout SRCS test_data_layout.cc DEPS gtest) -cc_test(pten_test_data_type SRCS test_data_type.cc DEPS gtest) -cc_test(pten_test_place SRCS test_place.cc DEPS pten_place) +cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) +cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) +cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) +cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index d74a35c9eae2e13b23e41099c75e5362ecd690db..fa4ffc84bf587defae06deb18dae283a64206b75 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -41,8 +41,8 @@ TEST(Backend, OStream) { oss << phi::Backend::MKLDNN; EXPECT_EQ(oss.str(), "MKLDNN"); oss.str(""); - oss << phi::Backend::CUDNN; - EXPECT_EQ(oss.str(), "CUDNN"); + oss << phi::Backend::GPUDNN; + EXPECT_EQ(oss.str(), "GPUDNN"); oss.str(""); try { oss << phi::Backend::NUM_BACKENDS; @@ -60,7 +60,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::XPU, pexp::StringToBackend("XPU")); EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); - EXPECT_EQ(phi::Backend::CUDNN, pexp::StringToBackend("CUDNN")); + EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); diff --git a/paddle/phi/tests/common/test_place.cc b/paddle/phi/tests/common/test_place.cc index c311a6733b04df645c0ee4c70e04b9f635377b04..ed2eb7126ed289c0eb31f4ac14be8492515afa60 100644 --- a/paddle/phi/tests/common/test_place.cc +++ b/paddle/phi/tests/common/test_place.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace phi { namespace tests { -TEST(PtenPlace, place) { +TEST(PhiPlace, place) { phi::Place place; EXPECT_EQ(place.GetType(), phi::AllocationType::UNDEFINED); diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 576ab7ffe6a66660a523905c3ab2c0fb13de1943..5356bac9fbd808f1f75eb13f4406d6d0661e60bd 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS pten_custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS phi_custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) @@ -6,7 +6,7 @@ cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scal cc_test(test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc DEPS dense_tensor sparse_coo_tensor) cc_test(test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc DEPS dense_tensor sparse_csr_tensor) cc_test(test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) -cc_test(test_pten_device_context SRCS test_device_context.cc DEPS pten_context cpu_context) +cc_test(test_phi_device_context SRCS test_device_context.cc DEPS phi_context cpu_context) cc_test(test_meta_fn_utils SRCS test_meta_fn_utils.cc DEPS dense_tensor wrapped_infermeta infermeta infermeta_utils) cc_test(test_ddim SRCS test_ddim.cc DEPS ddim) diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index bc75e6ec45245eceb7f919cd96fd4e76f0af9409..d8e42c9d0d8b11d393dbb71776671d9cb50a7715 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -148,9 +148,9 @@ TEST(CustomKernel, custom_kernel_dot) { // 3.before register auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); - EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePtenKernel(op_name)); + EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); - // mock fake_dot is supported by phi for HasCompatiblePtenKernel check while + // mock fake_dot is supported by phi for HasCompatiblePhiKernel check while // registering auto& fake_dot_kernels = kernels[op_name]; @@ -251,7 +251,7 @@ TEST(CustomKernel, custom_kernel_dot) { phi::dtype::float16 fake_attr_f16 = phi::dtype::float16(5); phi::DataType fake_attr_dtype = phi::DataType::UINT32; paddle::framework::LoDTensor tmp_tensor; - tmp_tensor.mutable_data({1}, phi::TransToPtenPlace(backend)); + tmp_tensor.mutable_data({1}, phi::TransToPhiPlace(backend)); phi::Scalar fake_attr_scalar{tmp_tensor}; phi::ScalarArray fake_attr_scalar_array; std::vector fake_attr_int64_vec; @@ -271,7 +271,7 @@ TEST(CustomKernel, custom_kernel_dot) { auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); phi::MetaTensor meta_out(dense_out.get()); diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index 9682e063471dfac551aa2c844506878f1c97ef46..c92e10f8dd74af072bb8836d65898e2fc9a79bcc 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -1,18 +1,19 @@ -cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS pten pten_api_utils) 
-cc_test(test_creation_dev_api SRCS test_creation_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_matmul_dev_api SRCS test_matmul_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS pten pten_api_utils) +cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_creation_dev_api SRCS test_creation_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_matmul_dev_api SRCS test_matmul_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils) cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) if(WITH_GPU) diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index 4cd283d925ab442fbf54aad85f4954caf0735816..d69c7b2174f726d5757ea707678ddb383cf19d68 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -62,7 +62,8 @@ TEST(DEV_API, copy) { .GetAllocator(paddle::platform::CPUPlace()) .get()); dev_ctx.Init(); - phi::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); + phi::Copy( + dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); // 3. check result for (int64_t i = 0; i < dense_src->numel(); i++) { diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index 8e825b7790111ac6b4a11a3d889f7b148dab9db4..8c2c8642ab9005472b74086e70457940b35f8619 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -39,7 +39,7 @@ TEST(DEV_API, empty) { dev_ctx.Init(); // 2. 
test API - auto out = phi::Empty(dev_ctx, {3, 2}, phi::DataType::INT32); + auto out = phi::Empty(dev_ctx, {3, 2}); // 3. check result ASSERT_EQ(out.dims().size(), 2); @@ -87,7 +87,7 @@ TEST(DEV_API, full) { .GetAllocator(paddle::platform::CPUPlace()) .get()); dev_ctx.Init(); - auto out = phi::Full(dev_ctx, {3, 2}, val, phi::DataType::FLOAT32); + auto out = phi::Full(dev_ctx, {3, 2}, val); // 3. check result ASSERT_EQ(out.dims().size(), 2); diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..576015143704b86957073bcf3f06b381e4b61592 --- /dev/null +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -0,0 +1,471 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace tests { + +std::vector flatten(const std::vector>& in) { + std::vector out; + if (in.size() == 0) return out; + const int cols = in[0].size(); + out.resize(in.size() * cols); + for (uint64_t i = 0; i < in.size(); i++) { + memcpy(&out[i * cols], in[i].data(), cols * sizeof(int)); + } + return out; +} + +template +std::vector cast(const std::vector& in) { + std::vector out(in.size()); + for (uint64_t i = 0; i < in.size(); i++) { + out[i] = static_cast(in[i]); + } + return out; +} + +template +void TestConv3dBase(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& kernel, + const DDim& kernel_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3) { + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.Init(); + + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + + DenseTensor indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + dev_ctx_cpu.Alloc(&indices_tensor, + indices_tensor.dtype(), + sizeof(int) * indices_tensor.numel()); + memcpy( + indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); + DenseTensor features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + dev_ctx_cpu.Alloc(&features_tensor, + features_tensor.dtype(), + features_tensor.numel() * sizeof(T)); + memcpy( + 
features_tensor.data(), features.data(), features.size() * sizeof(T)); + + SparseCooTensor x_tensor(indices_tensor, features_tensor, x_dims); + + DenseTensor kernel_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + kernel_dims, + DataLayout::NHWC)); + dev_ctx_cpu.Alloc( + &kernel_tensor, kernel_tensor.dtype(), kernel_tensor.numel() * sizeof(T)); + memcpy(kernel_tensor.data(), kernel.data(), kernel.size() * sizeof(T)); + + if (!std::is_same::value) { + DenseTensor rulebook = phi::Empty(dev_ctx_cpu); + SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, + x_tensor, + kernel_tensor, + paddings, + dilations, + strides, + 1, + &rulebook); + + ASSERT_EQ(correct_out_dims.size(), out.dims().size()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], out.dims()[i]); + } + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz()); + + int cmp_indices = memcmp(correct_out_indices.data(), + out.non_zero_indices().data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices, 0); + + for (uint64_t i = 0; i < correct_out_features.size(); i++) { + float tmp = std::fabs(static_cast( + correct_out_features[i] - out.non_zero_elements().data()[i])); + ASSERT_LT(tmp, diff); + } + } +} + +void TestConv3d(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& kernel, + const DDim& kernel_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations) { + // test float + TestConv3dBase(indices, + features, + x_dims, + kernel, + kernel_dims, + correct_out_indices, + correct_out_features, + correct_out_dims, + non_zero_num, + paddings, + strides, + dilations); + // test double + TestConv3dBase(indices, + cast(features), + x_dims, + cast(kernel), + kernel_dims, + correct_out_indices, + cast(correct_out_features), + correct_out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(DEV_API, sparse_conv3d) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 4; + std::vector> indices = { + {0, 0, 0, 0}, {0, 2, 0, 2}, {3, 2, 2, 3}, {3, 2, 3, 2}}; + std::vector indices_flatten = flatten(indices); + + std::vector features = {-0.2883, 0.0287, 0.2864, -0.0992}; + // 3*3*3=27 + std::vector kernel = { + 0.4721, 0.2292, 0.9751, 0.8616, 0.5784, 0.9178, 0.8727, 0.1659, 0.4455, + + 0.0189, 0.4646, 0.4472, 0.1991, 0.8968, 0.3717, 0.0051, 0.6963, 0.2690, + + 0.7473, 0.5403, 0.5391, 0.0796, 0.4734, 0.9097, 0.1712, 0.6237, 0.8837}; + + std::vector> out_indices = {{0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1}, + {0, 1, 0, 1, 0, 1, 0, 1}}; + std::vector out_indices_flatten = flatten(out_indices); + + std::vector out_features = { + 0.0254, 0.1455, -0.0615, 0.0862, 0.0077, 0.0200, -0.0160, -0.0433}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(DEV_API, sparse_conv3d_batch) { + const int in_channels = 1; + const int 
out_channels = 1; + DDim x_dims = {2, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {2, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 8; + std::vector> indices = {{0, 0, 0, 0, 1, 1, 1, 1}, + {0, 2, 0, 2, 0, 2, 0, 2}, + {3, 2, 2, 3, 3, 2, 2, 3}, + {3, 2, 3, 2, 3, 2, 3, 2}}; + std::vector indices_flatten = flatten(indices); + + std::vector features = { + -0.2883, 0.0287, 0.2864, -0.0992, -0.2883, 0.0287, 0.2864, -0.0992}; + // 3*3*3=27 + std::vector kernel = { + 0.4721, 0.2292, 0.9751, 0.8616, 0.5784, 0.9178, 0.8727, 0.1659, 0.4455, + + 0.0189, 0.4646, 0.4472, 0.1991, 0.8968, 0.3717, 0.0051, 0.6963, 0.2690, + + 0.7473, 0.5403, 0.5391, 0.0796, 0.4734, 0.9097, 0.1712, 0.6237, 0.8837}; + + std::vector> out_indices = { + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}}; + std::vector out_indices_flatten = flatten(out_indices); + + std::vector out_features = {0.0254, + 0.1455, + -0.0615, + 0.0862, + 0.0077, + 0.0200, + -0.0160, + -0.0433, + 0.0254, + 0.1455, + -0.0615, + 0.0862, + 0.0077, + 0.0200, + -0.0160, + -0.0433}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(DEV_API, sparse_conv3d_stride) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 1, 1, 1, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {2, 2, 2}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector> indices = { + {0, 0, 0}, {0, 2, 0}, {3, 2, 2}, {3, 2, 3}}; + std::vector indices_flatten = flatten(indices); + + std::vector features = {-0.28833008, 0.02873230, 0.28637695}; + // 3*3*3=27 + std::vector kernel = { + 0.45043945, 0.47216797, 0.22924805, 0.97509766, 0.86181641, 0.57861328, + 0.91796875, 0.87255859, 0.16589355, 0.44555664, 0.01889038, 0.46459961, + 0.44726562, 0.19909668, 0.89697266, 0.37158203, 0.00513077, 0.69628906, + 0.26904297, 0.74707031, 0.54003906, 0.5390625, 0.07958984, 0.47338867, + 0.90966797, 0.17126465, 0.62353516}; + + std::vector> out_indices = {{0, 0, 0, 0}}; + std::vector out_indices_flatten = flatten(out_indices); + + std::vector out_features = {0.01791}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(DEV_API, sparse_conv3d_dilation) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 6, 6, 6, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {2, 2, 2}; + + const int non_zero_num = 3; + std::vector> indices = { + {0, 0, 0}, {2, 3, 3}, {2, 3, 3}, {5, 2, 0}}; + std::vector indices_flatten = flatten(indices); + + std::vector features = {-0.78710938, -0.64746094, 0.98828125}; + // 3*3*3=27 + std::vector kernel = { + 0.20617676, 0.99365234, 0.16760254, 0.30639648, 0.41479492, 0.75732422, + 0.65625, 0.48535156, 0.72167969, 
0.56005859, 0.5, 0.3581543, + 0.20324707, 0.88769531, 0.81298828, 0.58398438, 0.30810547, 0.12634277, + 0.70507812, 0.38720703, 0.34814453, 0.02690125, 0.80273438, 0.90625, + 0.2277832, 0.4362793, 0.44482422}; + + std::vector> out_indices = {{0, 0, 0, 1, 0, 1, 1, 0}}; + std::vector out_indices_flatten = flatten(out_indices); + + std::vector out_features = {-0.64014, -0.37402}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(DEV_API, sparse_conv3d_padding) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 3, 3, 3, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 3, 3, 3, out_channels}; + std::vector paddings = {1, 1, 1}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 1; + std::vector> indices = {{0, 1, 0, 0}}; + std::vector indices_flatten = flatten(indices); + + std::vector features = {-0.79394531}; + // 3*3*3=27 + std::vector kernel = { + 0.34375, 0.22485352, 0.65820312, 0.75048828, 0.21411133, 0.17370605, + 0.85546875, 0.53076172, 0.28833008, 0.71044922, 0.00659943, 0.45922852, + 0.19372559, 0.64599609, 0.78808594, 0.49316406, 0.62646484, 0.40649414, + 0.62744141, 0.5703125, 0.23144531, 0.50048828, 0.31835938, 0.90869141, + 0.38208008, 0.60449219, 0.09075928}; + + std::vector out_indices_flatten = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + std::vector out_features = {-0.25269, + -0.39746, + -0.45288, + -0.49805, + -0.5127, + -0.15381, + -0.00524, + -0.56396, + -0.17004, + -0.5957, + -0.17847, + -0.27295}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(DEV_API, sparse_conv2d) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 1, 5, 5, in_channels}; + DDim kernel_dims = {1, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 1, 3, 3, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4}; + + std::vector features = {-0.79394531, -0.3125, -0.55029297}; + // 3*3*3=27 + std::vector kernel = {0.65820312, + 0.75048828, + 0.21411133, + 0.17370605, + 0.85546875, + 0.53076172, + 0.28833008, + 0.71044922, + 0.00659943}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 1, 2, 0, 1, 2}; + + std::vector out_features = { + -0.17004, -0.71338, -0.00206, -0.22205, -0.09009}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index a75ca633b05a8bf49af7842e1b96d356d5babdec..3e2ad0495f3ba85836dc08afa3f4fa4ed0b10afd 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -8,7 +8,7 @@ You may obtain a copy of the License at Unless required by applicable law or agreed to in 
writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF NCHW KIND, either express or implied. +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ @@ -53,8 +53,8 @@ inline void CheckResult( DenseTensorMeta(real_elements.dtype(), real_elements.dims(), real_elements.layout())); - phi::Copy(*dev_ctx_gpu, real_indices, true, &indices); - phi::Copy(*dev_ctx_gpu, real_elements, true, &elements); + phi::Copy(*dev_ctx_gpu, real_indices, indices.place(), true, &indices); + phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements); int cmp_indices = memcmp(indices.data(), non_zero_indices.data(), @@ -122,7 +122,7 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, cuda_alloc.get(), DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); - phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x); auto sparse_out = sparse::DenseToSparseCoo(dev_ctx_gpu, d_dense_x, sparse_dim); CheckResult(&dev_ctx_gpu, @@ -327,9 +327,9 @@ void TestSparseCsrToCoo(const DDim& dense_dims, phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta); phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, crows, true, &d_crows); - phi::Copy(dev_ctx_gpu, cols, true, &d_cols); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, crows, d_crows.place(), true, &d_crows); + phi::Copy(dev_ctx_gpu, cols, d_cols.place(), true, &d_cols); + phi::Copy(dev_ctx_gpu, values, d_values.place(), true, &d_values); phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCsrToCoo(dev_ctx_gpu, d_csr); CheckResult(&dev_ctx_gpu, @@ -406,9 +406,9 @@ inline void CheckCsrResult( DenseTensorMeta(real_elements.dtype(), real_elements.dims(), real_elements.layout())); - phi::Copy(*dev_ctx_gpu, real_crows, true, &crows); - phi::Copy(*dev_ctx_gpu, real_cols, true, &cols); - phi::Copy(*dev_ctx_gpu, real_elements, true, &elements); + phi::Copy(*dev_ctx_gpu, real_crows, crows.place(), true, &crows); + phi::Copy(*dev_ctx_gpu, real_cols, cols.place(), true, &cols); + phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements); int cmp_crows = memcmp(crows.data(), non_zero_crows.data(), @@ -500,8 +500,8 @@ void TestCooToCsr(const DDim& dense_dims, dev_ctx_gpu.PartialInitWithAllocator(); phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, indices, true, &d_indices); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, indices, phi::GPUPlace(), true, &d_indices); + phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values); phi::SparseCooTensor d_coo(d_indices, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCooToCsr(dev_ctx_gpu, d_coo); CheckCsrResult(&dev_ctx_gpu, @@ -593,7 +593,7 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, .GetAllocator(phi::CPUPlace()) .get()); dev_ctx_gpu.PartialInitWithAllocator(); - phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x); auto sparse_out = sparse::DenseToSparseCsr(dev_ctx_gpu, d_dense_x); CheckCsrResult(&dev_ctx_gpu, @@ -720,8 +720,10 @@ void TestSparseCooToDense(const DDim& 
dense_dims, dev_ctx_gpu.PartialInitWithAllocator(); DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta()); DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta()); - phi::Copy(dev_ctx_gpu, dense_indices, true, &d_dense_indices); - phi::Copy(dev_ctx_gpu, dense_elements, true, &d_dense_elements); + phi::Copy( + dev_ctx_gpu, dense_indices, phi::GPUPlace(), true, &d_dense_indices); + phi::Copy( + dev_ctx_gpu, dense_elements, phi::GPUPlace(), true, &d_dense_elements); SparseCooTensor coo_cuda(d_dense_indices, d_dense_elements, dense_dims); auto dense_out_cuda = sparse::SparseCooToDense(dev_ctx_gpu, coo_cuda); @@ -729,7 +731,8 @@ void TestSparseCooToDense(const DDim& dense_dims, DenseTensorMeta(dense_out_cuda.dtype(), dense_out_cuda.dims(), dense_out_cuda.layout())); - phi::Copy(dev_ctx_gpu, dense_out_cuda, true, &h_dense_out); + phi::Copy( + dev_ctx_gpu, dense_out_cuda, h_dense_out.place(), true, &h_dense_out); int cmp_cuda = memcmp( &dense_data[0], h_dense_out.data(), sizeof(T) * dense_data.size()); ASSERT_EQ(cmp_cuda, 0); @@ -858,13 +861,13 @@ void TestSparseCsrToDense(const DDim& dense_dims, phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta); phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, crows, true, &d_crows); - phi::Copy(dev_ctx_gpu, cols, true, &d_cols); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, crows, phi::GPUPlace(), true, &d_crows); + phi::Copy(dev_ctx_gpu, cols, phi::GPUPlace(), true, &d_cols); + phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values); phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCsrToDense(dev_ctx_gpu, d_csr); phi::DenseTensor h_out(alloc.get(), cpu_sparse_out.meta()); - phi::Copy(dev_ctx_gpu, cuda_sparse_out, true, &h_out); + phi::Copy(dev_ctx_gpu, cuda_sparse_out, phi::CPUPlace(), true, &h_out); int cmp_cuda = memcmp(h_out.data(), dense_data.data(), sizeof(T) * dense_data.size()); ASSERT_EQ(cmp_cuda, 0); diff --git a/paddle/phi/tests/kernels/test_split_dev_api.cc b/paddle/phi/tests/kernels/test_split_dev_api.cc index e6d6263128ec9ac91b1c65e98c476c49bdd30211..d5160933c1fa0374d8e31e52ec9d646e164025cb 100644 --- a/paddle/phi/tests/kernels/test_split_dev_api.cc +++ b/paddle/phi/tests/kernels/test_split_dev_api.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/phi/kernels/split_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/tests/ops_signature/CMakeLists.txt b/paddle/phi/tests/ops/CMakeLists.txt similarity index 100% rename from paddle/phi/tests/ops_signature/CMakeLists.txt rename to paddle/phi/tests/ops/CMakeLists.txt diff --git a/paddle/phi/tests/ops_signature/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc similarity index 98% rename from paddle/phi/tests/ops_signature/test_op_signature.cc rename to paddle/phi/tests/ops/test_op_signature.cc index 203517c75069db9df7a6d3329e19fcaaa39eee93..a6c9a27de7dc5a3a9f09d4c336fe9b50e4d453a5 100644 --- a/paddle/phi/tests/ops_signature/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/tests/ops_signature/test_op_signature.h" +#include "paddle/phi/tests/ops/test_op_signature.h" #include #include diff --git a/paddle/phi/tests/ops_signature/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h similarity index 100% rename from paddle/phi/tests/ops_signature/test_op_signature.h rename to paddle/phi/tests/ops/test_op_signature.h diff --git a/paddle/phi/tools/CMakeLists.txt b/paddle/phi/tools/CMakeLists.txt index bc690d3a9f1932e2fa1af97b6144c08aa957cc11..5693a46d97721934ec051702740db0684e6643a8 100644 --- a/paddle/phi/tools/CMakeLists.txt +++ b/paddle/phi/tools/CMakeLists.txt @@ -1,5 +1,5 @@ add_executable(print_pten_kernels print_pten_kernels.cc) -target_link_libraries(print_pten_kernels pten pten_api_utils) +target_link_libraries(print_pten_kernels phi phi_api_utils) if(WIN32) target_link_libraries(print_pten_kernels shlwapi.lib) endif() diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7f2ad893f67a34f2bd4772614ff8f94f33f3bdb8..4d7451f435271b4aaca3010e643ddcb5fbb28191 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -713,10 +713,159 @@ EOF fi } +function run_linux_cpu_test() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + pip install hypothesis + pip install ${PADDLE_ROOT}/build/python/dist/*whl + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + ut_total_startTime_s=`date +%s` + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat <> ${PADDLE_ROOT}/build/build_summary.txt + ut_actual_total_endTime_s=`date +%s` + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + if [[ "$EXIT_CODE" != "0" ]]; then + show_ut_retry_result + fi +set -ex + fi +} function get_precision_ut_mac() { on_precision=0 UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') - precison_cases="" + precision_cases="" if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python3.7 $PADDLE_ROOT/tools/get_pr_ut.py if [[ -f "ut_list" ]]; then @@ -2443,22 +2592,6 @@ function reuse_so_cache() { fi } -function find_temporary_files() { - set +x - jsonData=`curl \ - -H "Authorization: token ${GITHUB_API_TOKEN}"\ - -H "Accept: application/vnd.github.v3+json" \ - https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/files` - - result=`echo ${jsonData}|python ${PADDLE_ROOT}/tools/check_file_suffix.py` - - if [ ${#result} -gt 0 ] - then - echo ${result} - exit 65 - fi -} - function trt_convert_test() { set +e cd ${PADDLE_ROOT} @@ -2489,10 +2622,19 @@ function build_pr_and_develop() { rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt rm -rf ${PADDLE_ROOT}/build/third_party fi - git checkout -b develop_base_pr upstream/$BRANCH - cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} - generate_api_spec "$1" "DEV" - mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl + + git fetch upstream develop + dev_commit=`git log -1|head -1|awk '{print $2}'` + dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" + url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'` + if [ 
"$url_return" == '200' ];then + mkdir ${PADDLE_ROOT}/build/dev_whl && wget -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} + else + git checkout -b develop_base_pr upstream/$BRANCH + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + generate_api_spec "$1" "DEV" + mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl + fi } function build_develop() { @@ -2520,7 +2662,6 @@ function main() { set +e check_style_info=$(check_style) check_style_code=$? - find_temporary_files generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest @@ -2540,7 +2681,6 @@ function main() { ;; build_and_check_cpu) set +e - find_temporary_files generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest @@ -2691,9 +2831,11 @@ function main() { cmake_gen ${PYTHON_ABI:-""} build_mac ;; - cicheck_py35) + cicheck_py37) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} - parallel_test + run_linux_cpu_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} + + #parallel_test ;; cpu_cicheck_py35) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h index d2a9a3f11ef3ca0023dabcba2051b30da9c5c858..eec5f32be722617f87a85913be4b55e90d2929e7 100644 --- a/paddle/utils/optional.h +++ b/paddle/utils/optional.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 5a22d22151a1cd12b68fc3672faec965f399d5fd..26740dfd0f6dbbce655079566dfe046139661b02 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -14,6 +14,8 @@ import paddle from paddle.fluid.framework import dygraph_only +from paddle.fluid.dygraph.amp.auto_cast import amp_state +from paddle.amp.auto_cast import auto_cast from paddle.fluid import core __all__ = [] @@ -46,6 +48,7 @@ class PyLayerContext(object): def __init__(self): self.container = None + self._amp_state = amp_state() def save_for_backward(self, *tensors): """ @@ -178,6 +181,13 @@ class PyLayerBackward(PyLayerContext): def backward(self, *args, **kwargs): with paddle.fluid.dygraph.guard(): with paddle.fluid.dygraph.no_grad(): + if self._amp_state and 'enable' in self._amp_state and self._amp_state[ + 'enable']: + with auto_cast(**args[0]._amp_state): + return self._forward_cls.backward(*args, **kwargs) + else: + + return self._forward_cls.backward(*args, **kwargs) return self._forward_cls.backward(*args, **kwargs) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 19306d3da9916898059bf597a1577ea5aeb98822..0d985a523251754ff4335d76cd4ced7ef3f42f49 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -156,6 +156,16 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra ) base_group.add_argument("--selected_npus", dest="npus") + if fluid.core.is_compiled_with_mlu(): + base_group.add_argument( + "--mlus", + type=str, + default=None, + help="It's for mlu training. For example: " + "--mlus=\"0,1,2,3\" will launch four training processes each bound to one mlu." 
+ ) + base_group.add_argument("--selected_mlus", dest="mlus") + base_group.add_argument( "training_script", type=str, @@ -429,6 +439,8 @@ def infer_backend(args): args.backend = 'unknown' elif fluid.core.is_compiled_with_xpu(): args.backend = 'bkcl' + elif fluid.core.is_compiled_with_mlu(): + args.backend = 'cncl' else: args.backend = 'gloo' @@ -472,6 +484,8 @@ def which_distributed_mode(args): accelerators = fluid.core.get_npu_device_count() elif fluid.core.is_compiled_with_xpu(): accelerators = fluid.core.get_xpu_device_count() + elif fluid.core.is_compiled_with_mlu(): + accelerators = fluid.core.get_mlu_device_count() else: accelerators = 0 @@ -490,17 +504,18 @@ def which_distributed_mode(args): return DistributeMode.COLLECTIVE else: if not fluid.core.is_compiled_with_cuda( - ) and not fluid.core.is_compiled_with_xpu(): + ) and not fluid.core.is_compiled_with_xpu( + ) and not fluid.core.is_compiled_with_mlu(): if args.servers: logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu or npu. " + "Not found distinct arguments and not compiled with cuda or xpu or npu or mlu. " "But found args.servers not empty, default use ps mode") return DistributeMode.PS else: return DistributeMode.COLLECTIVE else: logger.warning( - "Not found distinct arguments and compiled with cuda or xpu or npu. " + "Not found distinct arguments and compiled with cuda or xpu or npu or mlu. " "Default use collective mode") return DistributeMode.COLLECTIVE @@ -536,6 +551,10 @@ def launch(): - ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``. + - ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu. + + - ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``. + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1`` @@ -688,7 +707,7 @@ def launch(): check_backend(args.backend) distribute_mode = DistributeMode.COLLECTIVE - #assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown'] + #assert args.backend in ['gloo', 'nccl', 'bkcl', 'cncl', 'heter', 'unknown'] if args.backend == 'gloo': logger.warning("launch start with CPUONLY mode") diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index c20c209d6017145981aec23ca14e526a47845c59..2dec58c75385320e936b62f50e55b23c2f180485 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -57,6 +57,7 @@ class DeviceMode(): XPU = 2 ASCEND_NPU = 3 UNKNOWN = 3 + MLU = 4 class Cluster(object): @@ -287,7 +288,7 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, ), "current trainer_endpoints size should be greater equal than acclerators size." 
for i in range(len(devices_per_proc)): trainer = Trainer() - if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU: + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU or device_mode == DeviceMode.MLU: if isinstance(devices_per_proc[i], (list, tuple)): trainer.accelerators.extend(devices_per_proc[i]) pod.accelerators.extend(devices_per_proc[i]) @@ -530,6 +531,9 @@ def start_local_trainers(cluster, accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU: proc_env["FLAGS_selected_npus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) + elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU: + proc_env["FLAGS_selected_mlus"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) if len(t.accelerators) > 0: proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( @@ -735,6 +739,35 @@ def get_npus(npus): return res_npus +def get_mlus(mlus): + if mlus is None: + mlus_num = fluid.core.get_mlu_device_count() + res_mlus = [str(x) for x in range(0, mlus_num)] + else: + mlu_visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + if mlu_visible_devices is None or mlu_visible_devices == "": + res_mlus = [x.strip() for x in mlus.split(',')] + else: + # change mlus into relative values + # e.g. MLU_VISIBLE_DEVICES=4,5,6,7; args.mlus=4,5,6,7; + # therefore mlus=0,1,2,3 + mlu_visible_devices_list = mlu_visible_devices.split(',') + for x in mlus.split(','): + assert x in mlu_visible_devices_list, "Can't find "\ + "your mlus %s in MLU_VISIBLE_DEVICES[%s]."\ + % (x, mlu_visible_devices) + res_mlus = [ + mlu_visible_devices_list.index(x.strip()) + for x in mlus.split(',') + ] + logger.info("Change selected_mlus into reletive values. --ips:{} " + "will change into relative_ips:{} according to your " + "MLU_VISIBLE_DEVICES:{}".format( + mlus, res_mlus, mlu_visible_devices_list)) + + return res_mlus + + def get_device_mode(backend): if backend == 'heter': if fluid.core.is_compiled_with_cuda() and \ @@ -763,6 +796,10 @@ def get_device_mode(backend): print("launch train in XPU mode") return DeviceMode.XPU + if backend == 'cncl' and fluid.core.get_mlu_device_count() > 0: + print("launch train in MLU mode") + return DeviceMode.MLU + if backend == 'gloo': print("launch train in CPU mode") return DeviceMode.CPU @@ -812,6 +849,18 @@ def get_device_proc_info(args): ] else: devices_per_proc = xpus + elif device_mode == DeviceMode.MLU: + mlus = get_mlus(args.mlus) + if args.nproc_per_node is not None: + assert (len(mlus) % int(args.nproc_per_node)) ==0, \ + "mlus' number:{} mod args.nproc_per_node:{} must == 0".format(len(mlus), args.nproc_per_node) + + n = int(len(mlus) / int(args.nproc_per_node)) + devices_per_proc = [ + mlus[i:i + n] for i in six.moves.range(0, len(mlus), n) + ] + else: + devices_per_proc = mlus elif device_mode == DeviceMode.CPU: if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None: #NOTE (xiongkun03) set it to cpu core number @@ -1719,7 +1768,7 @@ class ParameterServerLauncher(object): def check_backend(backend): - if backend not in ['nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter']: + if backend not in ['nccl', 'gloo', 'bkcl', 'cncl', 'auto', 'hccl', 'heter']: raise ValueError("paddle.distributed initialize error, " "backend argument can only be one of " "'nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter' " @@ -1743,6 +1792,12 @@ def check_backend(backend): "your paddle is not compiled with npu but you assign 'hccl' as backend." 
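The launcher changes above wire a Cambricon MLU path through fleet launch: a DeviceMode.MLU mode, the --mlus / --selected_mlus arguments, backend inference to 'cncl' for MLU builds, remapping of --mlus to indices relative to MLU_VISIBLE_DEVICES, and a FLAGS_selected_mlus environment variable exported to each worker. An illustrative sketch (not taken from the patch), assuming a Paddle build compiled with MLU support and a hypothetical train.py entry script:

    # Launched from a shell via the fleet launcher, e.g.
    #   fleetrun --mlus="0,1,2,3" train.py
    # Inside train.py each worker can inspect what it was given:
    import os
    import paddle.fluid as fluid

    print("compiled with MLU:", fluid.core.is_compiled_with_mlu())
    print("bound to MLU(s):", os.getenv("FLAGS_selected_mlus", ""))
    # If MLU_VISIBLE_DEVICES=4,5,6,7 and --mlus=4,5 were passed, get_mlus()
    # above remaps the selection to the relative indices 0,1 before export.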
) + if backend == 'cncl' and not fluid.core.is_compiled_with_mlu(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with mlu but you assign 'cncl' as backend." + ) + def block_windows_and_macos(backend): if backend != 'gloo': return @@ -1766,4 +1821,7 @@ def get_backend_by_compile_flag(): if fluid.core.is_compiled_with_npu(): return 'hccl' + if fluid.core.is_compiled_with_mlu(): + return 'cncl' + return 'gloo' diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 7e3dfde5d4f6758bba028c2ad76763e9ad732750..b8a696057e7800d9f7d3298762945462861e6b4b 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -502,9 +502,6 @@ class IpuStrategy(object): """ Help users precisely control the graph building in :code:`paddle.static.IpuCompiledProgram` . - Args: - None. - Returns: The IpuStrategy instance. @@ -517,23 +514,36 @@ class IpuStrategy(object): import paddle.static as static paddle.enable_static() + ipu_strategy = static.IpuStrategy() """ def __init__(self): if core.is_compiled_with_ipu(): self._ipu_strategy = core.IpuStrategy() + default_options = { + 'location_optimizer': { + 'on_chip': 0, + 'use_replicated_tensor_sharding': 1, + }, # set optimizer location + 'accumulation_and_replication_reduction_type': + 1, # popart::ReductionType::Mean + 'mean_accumulation_and_replication_reduction_strategy': + 1, # popart::MeanReductionStrategy::Post + } + self._ipu_strategy.set_options(default_options) + self.has_custom_ops = False + self.custom_op_names = [] else: raise RuntimeError( "Can not use IpuStrategy in non IPU compiled environment, please re-compile with WITH_IPU=ON." ) - def SetGraphConfig(self, - num_ipus=1, - is_training=True, - batch_size=1, - enable_manual_shard=False, - need_avg_shard=False): + def set_graph_config(self, + num_ipus=1, + is_training=True, + batch_size=1, + enable_manual_shard=False): """ Set graph configuration to the IpuStrategy instance. @@ -544,8 +554,6 @@ class IpuStrategy(object): if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set 1, if the batch-size is dynamice. enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1, enable_manual_shard is able to be set True. Default False, which means disabled. - need_avg_shard (bool, optional): Enable auto graph sharding or not. Only if num_ipus > 1 and enable_manual_shard=True, need_avg_shard is able to be set Trues. - Default False, which means disabled. Returns: None. @@ -559,32 +567,29 @@ class IpuStrategy(object): import paddle.static as static paddle.enable_static() + ipu_strategy = static.IpuStrategy() - ipu_strategy.SetGraphConfig(num_ipus=1, + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1, - enable_manual_shard=False, - need_avg_shard=False) + enable_manual_shard=False) """ - - self._ipu_strategy.num_ipus = num_ipus - self._ipu_strategy.is_training = is_training - self._ipu_strategy.batch_size = batch_size - self._ipu_strategy.enable_manual_shard = enable_manual_shard - if self._ipu_strategy.num_ipus == 1 and self._ipu_strategy.enable_manual_shard: + if num_ipus == 1 and enable_manual_shard: raise RuntimeError( "Only if num_ipus > 1, enable_manual_shard is able to be set True." 
) - self._ipu_strategy.need_avg_shard = need_avg_shard - if self._ipu_strategy.enable_manual_shard != True and self._ipu_strategy.need_avg_shard: - raise RuntimeError( - "Only if enable_manual_shard=True, need_avg_shard is able to be set True." - ) - - def SetPipeliningConfig(self, - enable_pipelining=False, - batches_per_step=1, - accumulationFactor=1): + options = { + 'num_ipus': num_ipus, + 'is_training': is_training, + 'micro_batch_size': batch_size, + 'enable_manual_shard': enable_manual_shard, + } + self.set_options(options) + + def set_pipelining_config(self, + enable_pipelining=False, + batches_per_step=1, + accumulation_factor=1): """ Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance. @@ -593,7 +598,7 @@ class IpuStrategy(object): Default False, which means disabled. batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1. Default 1, which means no data pipelining. - accumulationFactor (int, optional): Specify the number of micro-batches to accumulate + accumulation_factor (int, optional): Specify the number of micro-batches to accumulate before applying the varUpdate. Default 1, which means disable the accumulation. Returns: @@ -610,23 +615,23 @@ class IpuStrategy(object): paddle.enable_static() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetPipeliningConfig(enable_pipelining=False, - batches_per_step=1, - accumulationFactor=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, + batches_per_step=1, + accumulation_factor=1) """ - self._ipu_strategy.enable_pipelining = enable_pipelining - if self._ipu_strategy.enable_manual_shard != True and self._ipu_strategy.enable_pipelining: + enable_manual_shard = self.get_option('enable_manual_shard') + if not enable_manual_shard and enable_pipelining: raise RuntimeError( "Only if enable_manual_shard=True, enable_pipelining is able to be set True." ) - self._ipu_strategy.batches_per_step = batches_per_step - if self._ipu_strategy.enable_pipelining != True and self._ipu_strategy.batches_per_step > 1: - raise RuntimeError( - "Only if enable_pipelining=True, batches_per_step is able to be set > 1." - ) - self._ipu_strategy.accumulationFactor = accumulationFactor - - def SetHalfConfig(self, enable_fp16=False): + options = { + 'enable_pipelining': enable_pipelining, + 'batches_per_step': batches_per_step, + 'accumulation_factor': accumulation_factor, + } + self.set_options(options) + + def set_precision_config(self, enable_fp16=False): """ Set half computation configuration to the IpuStrategy instance. Used to optimize the performance. @@ -647,73 +652,135 @@ class IpuStrategy(object): paddle.enable_static() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetHalfConfig(enable_fp16=False) + ipu_strategy.set_precision_config(enable_fp16=False) + """ + options = {'enable_fp16': enable_fp16, } + self.set_options(options) + + def add_custom_op(self, + paddle_op, + popart_op=None, + domain='custom.ops', + version=1): """ + Add a mapping to use popart custom ops running on the IPU. - self._ipu_strategy.enable_fp16 = enable_fp16 + Args: + paddle_op(str): the name of custom op in paddle. - @property - def num_ipus(self): - """ - Get the number of IPU devices from IpuStrategy instance. - """ - return self._ipu_strategy.num_ipus + popart_op(str): the name of custom op in popart. - @property - def is_training(self): - """ - Get the boolean of training or inference from IpuStrategy instance. 
- """ - return self._ipu_strategy.is_training + domain(str): domain name of custom op in popart. - @property - def batch_size(self): + version(int): version of custom op in popart. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + ipu_strategy.add_custom_op('paddle_relu', 'popart_relu') """ - Get the batch_size used in dynamic batch_size graph from IpuStrategy instance. + if popart_op is None: + popart_op = paddle_op + custom_op = { + 'paddle_op': paddle_op, + 'popart_op': popart_op, + 'domain': domain, + 'version': version, + } + self.set_options({'custom_op': custom_op}) + self.custom_op_names.append(paddle_op) + if not self.has_custom_ops: + self.has_custom_ops = True + + def set_options(self, options): """ - return self._ipu_strategy.batch_size + Set options from dict. - @property - def enable_manual_shard(self): - """ - Get the boolean of enable manual shard or not from IpuStrategy instance. + Args: + options(dict): dict of options. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + options = {'num_ipus':1, 'enable_fp16': True} + ipu_strategy.set_options(options) """ - return self._ipu_strategy.enable_manual_shard + self._ipu_strategy.set_options(options) - @property - def need_avg_shard(self): + def get_option(self, option): """ - Get the boolean of need average shard or not from IpuStrategy instance. + Get option. + + Args: + option(str): name of option. + + Returns: + option value. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + num_ipus = ipu_strategy.get_option('num_ipus') """ - return self._ipu_strategy.need_avg_shard + return self._ipu_strategy.get_option(option)['value'] @property - def enable_pipelining(self): + def num_ipus(self): """ - Get the boolean of enable pipelining or not from IpuStrategy instance. + Get the number of IPU devices from IpuStrategy instance. """ - return self._ipu_strategy.enable_pipelining + return self.get_option('num_ipus') @property - def batches_per_step(self): + def is_training(self): """ - Get the number of batch_size per run in the pipelining mode from IpuStrategy instance. + Get the boolean of training or inference from IpuStrategy instance. """ - return self._ipu_strategy.batches_per_step + return self.get_option('is_training') @property - def accumulationFactor(self): + def enable_pipelining(self): """ - Get the number of micro-batches to accumulate before applying the varUpdate from IpuStrategy instance. + Get the boolean of enable pipelining or not from IpuStrategy instance. """ - return self._ipu_strategy.accumulationFactor + return self.get_option('enable_pipelining') @property def enable_fp16(self): """ Get the boolean of float16 mode or not from IpuStrategy instance. 
""" - return self._ipu_strategy.enable_fp16 + return self.get_option('enable_fp16') class IpuCompiledProgram(object): @@ -750,9 +817,9 @@ class IpuCompiledProgram(object): main_prog = static.default_main_program() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetGraphConfig(num_ipus=1, is_training=True, batch_size=1) - ipu_strategy.SetPipeliningConfig(enable_pipelining=False, batches_per_step=1, accumulationFactor=1) - ipu_strategy.SetHalfConfig(enable_fp16=False) + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) + ipu_strategy.set_precision_config(enable_fp16=False) ipu_compiled_program = static.IpuCompiledProgram( main_prog, @@ -766,14 +833,12 @@ class IpuCompiledProgram(object): ) if program is None: - program = default_main_program() + program = framework.default_main_program() if not isinstance(program, framework.Program): raise TypeError( "The type of program is wrong, expected Program, but got %s" % type(program)) - # import here to avoiding confused - import paddle self._program = program self._compiled = False @@ -781,23 +846,21 @@ class IpuCompiledProgram(object): if scope is not None: self._scope = scope else: + # import here to avoiding confused + import paddle self._scope = paddle.static.global_scope() if ipu_strategy is not None: - self._ipu_strategy = ipu_strategy._ipu_strategy + self._ipu_strategy = ipu_strategy else: - self._ipu_strategy = core.IpuStrategy() + self._ipu_strategy = IpuStrategy() - self._backend = core.IpuBackend() - self._backend.set_scope(self._scope) - self._backend.set_ipu_strategy(self._ipu_strategy) - self._graph_passes = [ - "optimizer_extract_pass", "optimizer_state_align_pass", - "forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass" - ] - global ipu_compiler_ref - ipu_compiler_ref = self + if ipu_strategy.has_custom_ops: + self._custom_op_names = set(ipu_strategy.custom_op_names) + else: + self._custom_op_names = () + + self._backend = core.IpuBackend.get_instance() def compile(self, feed_list, fetch_list): """ @@ -828,20 +891,23 @@ class IpuCompiledProgram(object): main_prog = static.default_main_program() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetGraphConfig(num_ipus=1, is_training=True, batch_size=1) - ipu_strategy.SetPipeliningConfig(enable_pipelining=False, batches_per_step=1, accumulationFactor=1) - ipu_strategy.SetHalfConfig(enable_fp16=False) + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) + ipu_strategy.set_precision_config(enable_fp16=False) program = static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile([a.name], [b.name]) """ + self._backend.set_scope(self._scope) + self._backend.set_ipu_strategy(self._ipu_strategy._ipu_strategy) + # feed and fetch doesn't have corresponding popart op, so we rm both here global_block = self._program.global_block() need_to_remove_op_index = [] for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) - if op.type == "feed" or op.type == "fetch": + if op.type == 'feed' or op.type == 'fetch': need_to_remove_op_index.append(i) for index in need_to_remove_op_index[::-1]: @@ -854,26 +920,45 @@ class IpuCompiledProgram(object): self._program.desc.flush() self._graph = core.Graph(self._program.desc) - for pass_name in self._graph_passes: - graph_pass = 
core.get_pass(pass_name) - if pass_name == "infer_shape_pass": - graph_pass.set("feed_list", feed_list) - graph_pass.apply(self._graph) - - ipu_inplace_pass = core.get_pass("ipu_inplace_pass") - ipu_inplace_pass.set("feed_list", feed_list) - ipu_inplace_pass.set("fetch_list", fetch_list) - ipu_inplace_pass.apply(self._graph) - - ipu_graph_builder_pass = core.get_pass("ipu_graph_builder_pass") - ipu_graph_builder_pass.set("feed_list", feed_list) - ipu_graph_builder_pass.set("fetch_list", fetch_list) - ipu_graph_builder_pass.apply(self._graph) - - ipu_runtime_replacer_pass = core.get_pass("ipu_runtime_replacer_pass") - ipu_runtime_replacer_pass.set("feed_list", feed_list) - ipu_runtime_replacer_pass.set("fetch_list", fetch_list) - ipu_runtime_replacer_pass.apply(self._graph) + if self._ipu_strategy.is_training: + passes = [ + 'optimizer_extract_pass', + 'optimizer_state_align_pass', + ] + for pass_name in passes: + a_pass = core.get_pass(pass_name) + a_pass.apply(self._graph) + + passes = [ + 'forward_graph_extract_pass', + 'infer_shape_pass', + 'avg_shard_pass', + 'delete_scale_op_pass', + ] + for pass_name in passes: + a_pass = core.get_pass(pass_name) + if pass_name == 'infer_shape_pass': + a_pass.set('feed_list', feed_list) + a_pass.apply(self._graph) + + a_pass = core.get_pass('popart_canonicalization_pass') + if self._custom_op_names: + a_pass.set('custom_ops', self._custom_op_names) + a_pass.apply(self._graph) + + a_pass = core.get_pass("transfer_cast_op_pass") + a_pass.apply(self._graph) + + passes = [ + 'ipu_inplace_pass', + 'ipu_graph_builder_pass', + 'ipu_runtime_replacer_pass', + ] + for pass_name in passes: + a_pass = core.get_pass(pass_name) + a_pass.set('feed_list', feed_list) + a_pass.set('fetch_list', fetch_list) + a_pass.apply(self._graph) convert_pass = core.get_pass('graph_to_program_pass') desc = core.ProgramDesc() @@ -904,9 +989,3 @@ class IpuCompiledProgram(object): program.org_program = self._program return program - - def clean(self): - self._backend.clear() - - def __del__(self): - self.clean() diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 41a7d3d774793140e5942f936ba4538d728db207..f43a51063b00ac0439aacfbf46ff593e7b1b4f43 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -75,9 +75,16 @@ PURE_FP16_BLACK_LIST = { 'lookup_table', 'lookup_table_v2', 'scatter', 'scatter_grad' } -BF16_WHITE_LIST = {'conv2d'} +BF16_WHITE_LIST = {'conv2d', 'matmul_v2'} BF16_BLACK_LIST = {' '} +_g_amp_state_ = None + + +def amp_state(): + global _g_amp_state_ + return _g_amp_state_ + #NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list # The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode. 
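# A hedged sketch of what the new module-level amp state enables: while inside
# paddle.amp.auto_cast() (which enters the amp_guard patched below), amp_state()
# returns the arguments the guard was entered with, and the previous state is
# restored on exit. The returned dict is whatever locals() captured at the top
# of amp_guard, so treating 'enable' as a key is an assumption.
import paddle
from paddle.fluid.dygraph.amp.auto_cast import amp_state

with paddle.amp.auto_cast(enable=True, level='O1'):
    state = amp_state()
    assert state is not None and state['enable'] is True

# assuming no enclosing auto_cast, the state is reset after the guard exits
assert amp_state() is None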
@@ -240,6 +247,11 @@ def amp_guard(enable=True, print(conv.dtype) # FP32 """ + amp_state = locals() + global _g_amp_state_ + original_state = _g_amp_state_ + _g_amp_state_ = amp_state + # check amp_level: O0-O2 level = level.upper() if not (level in ['O0', 'O1', 'O2']): @@ -349,6 +361,7 @@ def amp_guard(enable=True, yield finally: if tracer: + _g_amp_state_ = original_state tracer._amp_level = original_amp_level tracer._set_amp_op_list(original_white_list, original_black_list) # set_flags(original_flags) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index c4ea751ed92f8d12fd0b141b597e9810dac5200d..65bfba3f6c32e072a6db0e1d294a8c5fc07d9d74 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -33,10 +33,11 @@ import paddle.utils.deprecated as deprecated class TensorHookRemoveHelper(object): """ A helper class that for removing Tensor gradient's hook. + NOTE(wuweilong):the operation weakref.ref(tensor) will cause some unexpected errors in eager mode. """ def __init__(self, tensor, hook_id): - self._tensor_ref = weakref.ref(tensor) + self._tensor = tensor if core._in_eager_mode() else weakref.ref(tensor) self._hook_id = hook_id def remove(self): @@ -46,7 +47,7 @@ class TensorHookRemoveHelper(object): Returns: bool: Return True if removed successfully """ - tensor = self._tensor_ref() + tensor = self._tensor if core._in_eager_mode() else self._tensor() if tensor is not None: res = tensor._remove_grad_hook(self._hook_id) if res is True: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 447d6457e0a3cd7451f654905931052d3217c1e9..e372727b0f0b6a338cd43ac81001bb32ffd03ecc 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1583,9 +1583,6 @@ class Executor(object): lr_sheduler = program.lr_sheduler lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] - if core.is_compiled_with_ipu(): - if hasattr(program.lr_sheduler, 'lr_var'): - lr_var = program.lr_sheduler.lr_var data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) tensor.set(data, self.place) diff --git a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc index 6e0b44b71f7f87447bd66a052f0a394ab38b2874..76158596cb815022bc1a92cde75c9bd51be92857 100644 --- a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc @@ -17,9 +17,9 @@ limitations under the License. 
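# A small sketch of the hook-removal helper touched above: register_hook() hands
# back a TensorHookRemoveHelper, and remove() detaches the hook again. Whether
# the helper stores the tensor directly (eager mode) or through weakref.ref
# (legacy dygraph) is exactly the detail this patch switches on.
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
helper = x.register_hook(lambda grad: grad * 2)   # returns a TensorHookRemoveHelper
y = (x * x).sum()
y.backward()
print(x.grad)            # gradient of sum(x*x), doubled by the hook
assert helper.remove()   # True once the hook has been removed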
*/ #include "paddle/extension.h" // The linear implemented here must be passed in bias -std::vector PtenLinearForward(const paddle::Tensor& x, - const paddle::Tensor& weight, - const paddle::Tensor& bias) { +std::vector PhiLinearForward(const paddle::Tensor& x, + const paddle::Tensor& weight, + const paddle::Tensor& bias) { return { paddle::experimental::add(paddle::experimental::matmul(x, weight), bias)}; } @@ -90,6 +90,6 @@ std::vector LinearInferDtype( PD_BUILD_OP(pten_linear) .Inputs({"X", "Weight", "Bias"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(PtenLinearForward)) + .SetKernelFn(PD_KERNEL(PhiLinearForward)) .SetInferShapeFn(PD_INFER_SHAPE(LinearInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(LinearInferDtype)); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index ad0a81e725707335c9a76224ead882fae1e8162b..2361bd270623873384d3cea8cd11eb10a78ec116 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -590,7 +590,10 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -py_test_modules(test_warpctc_op MODULES test_warpctc_op) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + py_test_modules(test_warpctc_op MODULES test_warpctc_op) + set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) +endif() py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS @@ -935,7 +938,7 @@ set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES TIMEOUT 120) -set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) + set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 2bea60c3ded1a09c8ff5c1e8ecb318a83691d536..729c9c46b4f0cab2374d951b54deeaffe9cb0c1d 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -10,6 +10,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass") list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py new file mode 100644 index 
0000000000000000000000000000000000000000..3dc0623a112f51fb74654f575db4194f06c79e5b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import MkldnnAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +from hypothesis import given +import hypothesis.strategies as st + + +class TestMKLDNNLogSoftmaxOp(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(*args, **kwargs): + return np.random.random(kwargs['in_shape']).astype(np.float32) + + logsoftmax_op = OpConfig( + type="log_softmax", + inputs={"X": ["input_data"]}, + outputs={"Out": ["output_data"]}, + attrs={"axis": kwargs['axis']}) + + program_config = ProgramConfig( + ops=[logsoftmax_op], + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input, + *args, **kwargs)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + axis=st.sampled_from([-2, -1, 0, 1]), + in_shape=st.lists( + st.integers( + min_value=2, max_value=5), min_size=3, max_size=5)) + def test(self, *args, **kwargs): + self.run_test(quant=False, *args, **kwargs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7477eaf3339b25a9c40fcf0870b55544e7cf5a2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
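# For reference, a numerically stable numpy sketch of what the imported
# ref_log_softmax helper is assumed to compute along one axis:
# log_softmax(x) = x - max(x) - log(sum(exp(x - max(x)))).
import numpy as np

def log_softmax_ref(x, axis=-1):
    shifted = x - np.max(x, axis=axis, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))

x = np.random.rand(2, 3, 4).astype(np.float32)
probs = np.exp(log_softmax_ref(x, axis=-1))
np.testing.assert_allclose(probs.sum(axis=-1), np.ones((2, 3)), rtol=1e-6)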
+ +import unittest +import numpy as np +import paddle +from paddle.fluid import core +from paddle.fluid.tests.unittests.test_log_softmax import ref_log_softmax +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestLogSoftmaxOneDNNOp(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.set_dtype() + self.set_shape() + self.set_axis() + + x = np.random.uniform(0.1, 1.0, self.shape).astype(np.float32) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + + if self.dtype == np.uint16: + x = convert_float_to_uint16(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, 'use_mkldnn': True} + + def set_dtype(self): + self.dtype = np.float32 + + def set_shape(self): + self.shape = [2, 3, 4, 5] + + def set_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + +class TestLogSoftmax1DOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_shape(self): + self.shape = [100] + + +class TestLogSoftmax3DOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_shape(self): + self.shape = [12, 10, 3] + + +class TestLogSoftmax5DOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_shape(self): + self.shape = [2, 3, 4, 5, 6] + + +class TestLogSoftmaxPositiveAxisOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_axis(self): + self.axis = 2 + + +# BF16 TESTS +class TestLogSoftmax1DBF16OneDNNOp(TestLogSoftmax1DOneDNNOp): + def set_dtype(self): + self.dtype = np.uint16 + + +class TestLogSoftmaxPositiveAxisBF16OneDNNOp( + TestLogSoftmaxPositiveAxisOneDNNOp): + def set_dtype(self): + self.dtype = np.uint16 + + +class TestLogSoftmax5DBF16OneDNNOp(TestLogSoftmax5DOneDNNOp): + def set_shape(self): + self.shape = [2, 3, 4, 5, 6] + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index 2e588355ce7939674fcddba58f477b19eac4dde2..c17790bd3200e2bac9841c1198572af6e1740420 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -1,9 +1,25 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +file(GLOB TEST_DIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_collective_*.py") +string(REPLACE ".py" "" TEST_DIST_OPS "${TEST_DIST_OPS}") if (WITH_MLU) + foreach(TEST_OP ${TEST_DIST_OPS}) + LIST(REMOVE_ITEM TEST_OPS ${TEST_OP}) + endforeach(TEST_OP) + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) - set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) + + if(WITH_CNCL) + foreach(TEST_OP ${TEST_DIST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) + bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + endif(WITH_CNCL) endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py 
b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0371e1bbb24061340209194dc720f64d1b3c39e3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base_mlu import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllreduce(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofallreduce", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_allreduce_sum", + inputs={'X': tindata}, + attrs={'ring_id': ring_id}, + outputs={'Out': toutdata}) + main_prog.global_block().append_op( + type="c_sync_comm_stream", + inputs={'X': toutdata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduce, "allreduce", 0) diff --git a/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..9ea550a8452e4b60ec20fd3159bd78bd8d1e7368 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
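# A plain-numpy sketch of what the new "allreduce" branch added to
# test_collective_base_mlu.py verifies: after c_allreduce_sum, every rank should
# hold the elementwise sum of all ranks' inputs, matched with rtol=atol=1e-05.
import numpy as np

rank_inputs = [np.random.random((10, 1000)).astype(np.float32) for _ in range(2)]
need_result = rank_inputs[0] + rank_inputs[1]

# tr0_out / tr1_out stand in for the tensors fetched from each trainer process.
tr0_out = need_result.copy()
tr1_out = need_result.copy()
assert np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05)
assert np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05)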
+ +import os +import sys +import time +import paddle.fluid as fluid + + +def train(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + + +def train_abort(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + if trainer_id == 0: + try: + # train abort + exit(1) + except SystemExit: + name = "abort>>> selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + print(name) + with open( + "multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + raise + else: + # sleep 30s to make sure paddle.distributed.launch will terminate this process + time.sleep(30) + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + + +if __name__ == '__main__': + if len(sys.argv) == 3 and sys.argv[2] == "abort": + prefix = sys.argv[1] + train_abort(prefix) + else: + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..9b2713532e41b2ffd6994278623153cd46163545 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import time + + +def train(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("{}.check_{}.log".format(prefix, trainer_id), "w") as f: + f.write(name) + + +if __name__ == '__main__': + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..5fd5db7a604d56fe427792be04d804250642dda0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +import unittest +import numpy as np +import paddle + +from test_collective_base_mlu import TestDistBase + +paddle.enable_static() + + +class TestCAllreduceOp(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_fp32(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "float32") + + def test_allreduce_fp16(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "float16") + + def test_allreduce_int32(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "int32") + + def test_allreduce_int16(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "int16") + + def test_allreduce_int8(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", "int8") + + def test_allreduce_uint8(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "uint8") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py index 2a7c64fe48972330b37ad8ab965f63f070e0ce65..4692c893d00b4595c1927f01c7e1b55dd6935c70 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py @@ -262,5 +262,13 @@ class TestDistBase(unittest.TestCase): need_result = input2 self.assertTrue(np.allclose(tr0_out, need_result)) self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == "allreduce": + need_result = input1 + input2 + self.assertTrue( + np.allclose( + tr0_out, need_result, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result, rtol=1e-05, atol=1e-05)) else: pass diff --git 
a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..bc8a08c39ffc8c96f55f3367cf183448cf55d9ba --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py @@ -0,0 +1,240 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, compiler, program_guard +from paddle.fluid.op import Operator + +import sys +sys.path.append('..') +from op_test import OpTest, skip_check_grad_ci + +paddle.enable_static() + + +class ElementwiseMulOp(OpTest): + def init_kernel_type(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def setUp(self): + self.op_type = "elementwise_mul" + self.dtype = np.float32 + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('Y')) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + def init_dtype(self): + pass + + def init_axis(self): + pass + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMulOp_scalar(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_Vector(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x * self.y.reshape(100, 1, 1) + + def 
init_axis(self): + self.axis = 0 + + +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1) + } + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100) + } + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1) + } + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 2, 11).astype(np.float32), + 'Y': np.random.rand(10, 1, 11).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 4, 2, 3).astype(np.float32), + 'Y': np.random.rand(10, 4, 1, 3).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOpFp16(ElementwiseMulOp): + def init_dtype(self): + self.dtype = np.float16 + + +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(30, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(30, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 10).astype(np.float32), + 'Y': np.random.rand(2, 2, 10, 10).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y'] + } + self.init_kernel_type() + + +class TestElementwiseMulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_mul must be Variable. 
+ x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1) + + # the input dtype of elementwise_mul must be float16 or float32 or int32 + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh new file mode 100644 index 0000000000000000000000000000000000000000..adf3019186163e98b18914958d529303d7dc8c27 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud +unset PADDLE_PORT +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 +export cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export TRAINER_PORTS_NUM=2 + +file_0="multi_process_fullpath_launch.check_0.log" +file_1="multi_process_fullpath_launch.check_1.log" + +distributed_args="--ips=${cluster_node_ips} --mlus=0,1 --log_dir=testlog" + +echo "paddle.distributed.fleet.launch async poll process test" +if ! MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process_mlu.py fullpath_launch abort; then + echo "train abort as planned" +fi + +abort_str1="abort>>> selected_mlus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0" + +if grep -q "$abort_str1" "$file_0"; then + echo "trainer 0 abort as planned" +else + echo "trainer 0 not abort as planned" + exit -1 +fi + +if [ ! -f $file_1 ]; then + echo "trainer 1 terminate as planned" +else + echo "trainer 1 not terminate as planned" + rm $file_1 + exit -1 +fi + +if [ -f $file_0 ]; then + rm $file_0 +fi diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh new file mode 100644 index 0000000000000000000000000000000000000000..b93b21c1bdf6877801c1c4e99385c07fb1a894ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
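# A numpy sketch of the axis-based broadcasting exercised by the
# TestElementwiseMulOp_broadcast_* cases in test_elementwise_mul_op_mlu.py above:
# with axis=1, a Y of shape (100,) is aligned against dimension 1 of a
# (2, 100, 3) X, i.e. reshaped to (1, 100, 1) before the elementwise multiply.
import numpy as np

x = np.random.rand(2, 100, 3).astype(np.float32)
y = np.random.rand(100).astype(np.float32)

expected = x * y.reshape(1, 100, 1)          # what elementwise_mul computes for axis=1
np.testing.assert_allclose(expected, x * y[np.newaxis, :, np.newaxis])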
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +# use paddlecloud +echo "begin test use paddlecloud" +cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=2 + +distributed_args="--ips=${cluster_node_ips} --mlus=0,1 --log_dir=testlog" +MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process_mlu.py fleetlaunchcloud + +str1="selected_mlus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0" +str2="selected_mlus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1" +file_0="multi_process_fleetlaunchcloud.check_0.log" +file_1="multi_process_fleetlaunchcloud.check_1.log" + +echo "paddlecloud params test" +if grep -q "$str1" "$file_0"; then + echo "find trainer 0" +else + echo "not find trainer 0" + exit -1 +fi + +if grep -q "$str2" "$file_1"; then + echo "find trainer 1" +else + echo "not find trainer 1" + exit -1 +fi + +# test async poll process +if [ -f $file_0 ]; then + rm $file_0 +fi +if [ -f $file_1 ]; then + rm $file_1 +fi diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh new file mode 100644 index 0000000000000000000000000000000000000000..722590dc87f09f67823e3eb9d95b69c9a0d29c6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e +export FLAGS_START_PORT=35789 + +export MLU_VISIBLE_DEVICES=0,1 + +function test_nproc_0(){ + mlus=$1 + file_0="fleet_nproc_0.check_0.log" + rm -f ${file_0} + distributed_args="--log_dir=testlog --nproc_per_node=1" + python -m paddle.distributed.launch ${distributed_args} nproc_process_mlu.py fleet_nproc_0 + + str0="selected_mlus:${mlus} worker_endpoints:127.0.0.1:35789 trainers_num:1 current_endpoint:127.0.0.1:35789 trainer_id:0" + if grep -q "$str0" "$file_0"; then + echo "find trainer 0" + else + echo "not find trainer 0" + exit -1 + fi + if [ -f $file_0 ]; then + rm $file_0 + fi +} + + +function test_nproc_1(){ + file_0="fleet_nproc_1.check_0.log" + file_1="fleet_nproc_1.check_1.log" + rm -f ${file_0} ${file_1} + + distributed_args="--log_dir=testlog --nproc_per_node=2" + python -m paddle.distributed.launch ${distributed_args} nproc_process_mlu.py fleet_nproc_1 + + str0="selected_mlus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0" + if grep -q "$str0" "$file_0"; then + echo "find trainer 0" + else + echo "not find trainer 0" + exit -1 + fi + + str1="selected_mlus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1" + if grep -q "$str1" "$file_1"; then + echo "find trainer 1" + else + echo "not find trainer 1" + exit -1 + fi + if [ -f $file_0 ]; then + rm $file_0 + fi + if [ -f $file_1 ]; then + rm $file_1 + fi +} + +test_nproc_0 "0,1" + +test_nproc_1 diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..ef33719d368e8b2ecb248f4e14f6bd6a031c26bb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestMLUReduceMaxOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-1]} + self.outputs = { + 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpMultiAxises(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-2, -1]} + self.outputs = { + 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) + } + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceAll(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].max()} + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpWithOutDtype_int32(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': + self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.int32) + } + + def init_dtype(self): + self.dtype = np.int32 + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpWithOutDtype_fp16(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP16) + } + self.outputs = { + 'Out': self.inputs['X'].max( + 
axis=tuple(self.attrs['dim'])).astype(np.float16) + } + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpWithOutDtype_fp32(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.outputs = { + 'Out': self.inputs['X'].max( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.float32 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..284f8f984c232d3c178bcb13977b958ac4775a30 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
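# A numpy sketch of the reference values these reduce_max / reduce_min MLU tests
# compare against: reduce over the trailing two dims and, when out_dtype is set,
# cast the result (e.g. to float16, checked with a relaxed atol in the fp16 case
# above).
import numpy as np

x = np.random.random((5, 6, 10)).astype(np.float32)
out_max_fp16 = x.max(axis=(-2, -1)).astype(np.float16)   # dim=[-2, -1], out_dtype=FP16
out_min_all = x.min()                                     # reduce_all=True
print(out_max_fp16.shape, float(out_min_all))             # (5,) and a scalar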
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestMLUReduceMinOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-1]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpMultiAxises(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-2, -1]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceAll(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].min()} + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpWithOutDtype_int32(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': + self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(np.int32) + } + + def init_dtype(self): + self.dtype = np.int32 + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpWithOutDtype_fp16(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP16) + } + self.outputs = { + 'Out': self.inputs['X'].min( + 
axis=tuple(self.attrs['dim'])).astype(np.float16) + } + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpWithOutDtype_fp32(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.outputs = { + 'Out': self.inputs['X'].min( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.float32 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d067a2bd577880a58e757a422c52058661b4eedb..d1d391a3949ead28697c0756803e873c41914079 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -98,6 +98,46 @@ class TestFP16ElementwiseAddOp(TestElementwiseAddOp): place, atol=1e-3, check_dygraph=(self.use_mkldnn == False)) +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestBF16ElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "elementwise_add" + self.dtype = np.uint16 + + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.out = np.add(self.x, self.y) + + self.axis = -1 + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)), + 'Y': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y)) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_scalar(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 7bace9bc535243194e2ed9ca82db49e6d1b4f2f4..00967cb503fe5fd677839a869798964bb5fb0b71 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 
+23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): @@ -83,6 +83,39 @@ class ElementwiseMulOp(OpTest): pass +class TestBF16ElementwiseMulOp(OpTest): + def setUp(self): + self.op_type = "elementwise_mul" + self.dtype = np.uint16 + + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.out = np.multiply(self.x, self.y) + + self.axis = -1 + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)), + 'Y': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y)) + } + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.attrs = {'axis': self.axis, 'use_mkldnn': False} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseMulOp_scalar(ElementwiseMulOp): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index 2594c96eebd69fcdd88d48e793e48d854b79535a..6801a4bc5f30b4829e8e9ceae201ab050b30758e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -17,7 +17,8 @@ import unittest import numpy as np import paddle import paddle.fluid as fluid -from op_test import OpTest, skip_check_grad_ci +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class TestElementwiseOp(OpTest): @@ -44,6 +45,33 @@ class TestElementwiseOp(OpTest): ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) +class TestBF16ElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + out = x - y + + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseSubOp_scalar(TestElementwiseOp): diff --git a/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py b/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py index 815598d9017665291878d43c6f1195d7681214f8..a429717bdaf37b3724820d3e074c38a216634cdf 100644 --- a/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py +++ b/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py @@ -19,13 +19,13 @@ from paddle import compat as cpt class 
TestGetAllRegisteredOpKernels(unittest.TestCase): - # reshape kernel is in fluid while not in pten - def test_pten_kernels(self): - self.assertTrue(core._get_all_register_op_kernels('pten')['sign']) + # reshape kernel is in fluid while not in phi + def test_phi_kernels(self): + self.assertTrue(core._get_all_register_op_kernels('phi')['sign']) with self.assertRaises(KeyError): - core._get_all_register_op_kernels('pten')['reshape'] + core._get_all_register_op_kernels('phi')['reshape'] - # sign kernel is removed from fluid and added into pten + # sign kernel is removed from fluid and added into phi def test_fluid_kernels(self): self.assertTrue(core._get_all_register_op_kernels('fluid')['reshape']) with self.assertRaises(KeyError): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 306c6b4707e8a3d7386bd8af3e32e55d09d563c4..5cb72512f99af7b4948e9fe4c01e9b993c1e247e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -20,6 +20,7 @@ import six from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting import paddle.nn as nn from paddle.static import InputSpec +from paddle.autograd import PyLayer if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -1130,20 +1131,55 @@ class TestBf16(unittest.TestCase): test amp for BF16 ''' - def train(self, enable_amp=True): + def train(self, enable_amp=True, amp_level='O1'): paddle.seed(100) input = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) conv = paddle.nn.Conv2D(4, 6, (3, 3)) with paddle.amp.auto_cast( - enable=enable_amp, level='O2', dtype='bfloat16'): + enable=enable_amp, level=amp_level, dtype='bfloat16'): output = conv(input) output = output.cast('float32') return output.numpy() def test_bf16(self): - out_fp32 = self.train(enable_amp=False) - out_bf16 = self.train(enable_amp=True) - self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-2)) + if fluid.core.is_compiled_with_cuda(): + cudnn_version = paddle.device.get_cudnn_version() + if cudnn_version is not None and cudnn_version >= 8100: + out_fp32 = self.train(enable_amp=False) + out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') + out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) + + +class TestPyLayerWithAmp(unittest.TestCase): + def test_pylayer(self): + class MyMM(PyLayer): + @staticmethod + def forward(ctx, a, b): + ctx.save_for_backward(a, b) + return a.mm(b) + + @staticmethod + def backward(ctx, grad): + a, b = ctx.saved_tensor() + # NOTE(zhiqiu): a and b is float32 now, while grad is fp16 when forward runs with auto_cast() + # thus, the mm operation raise errors because of the dtype of inputs are inconsistent + return grad.mm(b.t()), a.t().mm(grad) + + x = paddle.rand([10, 10]) + y = paddle.rand([10, 10]) + x.stop_gradient = False + y.stop_gradient = False + + with paddle.amp.auto_cast(): + res = MyMM.apply(x, y) + loss = paddle.mean(res) + loss.backward() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index 
0dd6c9f893e2a78dff9f77617853b3d8e35a6648..d1437ca9c96f1ba5fd2b9e1e420f91414d4f923a 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -14,7 +14,7 @@ import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest import paddle import paddle.nn.functional as F diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 52256766fed7585cc5815e636ecff8403d382c5e..3238876b89414b89d09a8b4161ef9e5ba2450261 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -19,6 +19,7 @@ import numpy as np import paddle import paddle.nn as nn +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class SimpleNet(nn.Layer): @@ -64,7 +65,7 @@ class TestTensorRegisterHook(unittest.TestCase): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def test_hook_for_interior_var(self): + def func_hook_for_interior_var(self): def run_double_hook_for_interior_var(double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -154,7 +155,12 @@ class TestTensorRegisterHook(unittest.TestCase): # register hook and removed run_print_hook_for_interior_var(print_hook, removed=True) - def test_hook_for_leaf_var(self): + def test_hook_for_interior_var(self): + with _test_eager_guard(): + self.func_hook_for_interior_var() + self.func_hook_for_interior_var() + + def func_hook_for_leaf_var(self): def run_double_hook_for_leaf_var(double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -193,7 +199,12 @@ class TestTensorRegisterHook(unittest.TestCase): # register hook and removed run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True) - def test_hook_for_accumulated_grad_interior_var(self): + def test_hook_for_leaf_var(self): + with _test_eager_guard(): + self.func_hook_for_leaf_var() + self.func_hook_for_leaf_var() + + def func_hook_for_accumulated_grad_interior_var(self): def run_double_hook_for_accumulated_grad_interior_var(double_hook, removed=False): for device in self.devices: @@ -248,7 +259,12 @@ class TestTensorRegisterHook(unittest.TestCase): run_double_hook_for_accumulated_grad_interior_var( lambda grad: grad * 2, removed=True) - def test_hook_for_accumulated_grad_leaf_var(self): + def test_hook_for_accumulated_grad_interior_var(self): + with _test_eager_guard(): + self.func_hook_for_accumulated_grad_interior_var() + self.func_hook_for_accumulated_grad_interior_var() + + def func_hook_for_accumulated_grad_leaf_var(self): def run_double_hook_for_accumulated_grad_leaf_var(double_hook, removed=False): for device in self.devices: @@ -289,7 +305,12 @@ class TestTensorRegisterHook(unittest.TestCase): run_double_hook_for_accumulated_grad_leaf_var( lambda grad: grad * 2, removed=True) - def test_hook_in_model(self): + def test_hook_for_accumulated_grad_leaf_var(self): + with _test_eager_guard(): + self.func_hook_for_accumulated_grad_leaf_var() + self.func_hook_for_accumulated_grad_leaf_var() + + def func_hook_in_model(self): def run_double_hook_in_model(data, label, hook=None, @@ -336,7 +357,12 @@ class TestTensorRegisterHook(unittest.TestCase): self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) - def 
test_multiple_hooks_for_interior_var(self): + def test_func_hook_in_model(self): + with _test_eager_guard(): + self.func_hook_in_model() + self.func_hook_in_model() + + def func_multiple_hooks_for_interior_var(self): def run_multiple_hooks_for_interior_var(device, hooks, remove1=False, @@ -414,6 +440,12 @@ class TestTensorRegisterHook(unittest.TestCase): self.assertTrue(np.array_equal(x_grad, z)) self.assertTrue(np.array_equal(y_grad, z)) + def test_multiple_hooks_for_interior_var(self): + with _test_eager_guard(): + self.func_multiple_hooks_for_interior_var() + self.func_multiple_hooks_for_interior_var() + + # TODO(wuweilong): enable this case when DoubleGrad in eager mode is ready def test_hook_in_double_grad(self): def double_print_hook(grad): grad = grad * 2 @@ -446,7 +478,7 @@ class TestTensorRegisterHook(unittest.TestCase): z.backward() self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.]))) - def test_remove_one_hook_multiple_times(self): + def func_remove_one_hook_multiple_times(self): for device in self.devices: paddle.set_device(device) @@ -457,7 +489,12 @@ class TestTensorRegisterHook(unittest.TestCase): self.assertTrue(h.remove()) self.assertFalse(h.remove()) - def test_register_hook_for_stop_gradient_var(self): + def test_remove_one_hook_multiple_times(self): + with _test_eager_guard(): + self.func_remove_one_hook_multiple_times() + self.func_remove_one_hook_multiple_times() + + def func_register_hook_for_stop_gradient_var(self): for device in self.devices: paddle.set_device(device) @@ -466,6 +503,11 @@ class TestTensorRegisterHook(unittest.TestCase): with self.assertRaises(RuntimeError): x.register_hook(lambda grad: grad * 2) + def test_register_hook_for_stop_gradient_var(self): + with _test_eager_guard(): + self.func_register_hook_for_stop_gradient_var() + self.func_register_hook_for_stop_gradient_var() + def test_register_hook_in_static_mode(self): paddle.enable_static() @@ -482,7 +524,7 @@ class TestTensorRegisterHook(unittest.TestCase): paddle.disable_static() - def test_register_hook_in_dy2static_mode(self): + def func_register_hook_in_dy2static_mode(self): net = SimpleNetForStatic(self.in_size, self.out_size) jit_net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec([None, self.in_size])]) @@ -491,8 +533,17 @@ class TestTensorRegisterHook(unittest.TestCase): size=[self.batch_size, self.in_size]).astype('float32') data_t = paddle.to_tensor(data) - with self.assertRaises(AssertionError): - out = jit_net(data_t) + if _in_eager_mode(): + with self.assertRaises(TypeError): + out = jit_net(data_t) + else: + with self.assertRaises(AssertionError): + out = jit_net(data_t) + + def test_register_hook_in_dy2static_mode(self): + with _test_eager_guard(): + self.func_register_hook_in_dy2static_mode() + self.func_register_hook_in_dy2static_mode() HOOK_INIT_VALUE = 10 @@ -512,7 +563,7 @@ class TestTensorRegisterBackwardHook(unittest.TestCase): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def test_register_backward_hook(self): + def func_register_backward_hook(self): global HOOK_INIT_VALUE global HOOK_IS_CALLED for device in self.devices: @@ -529,20 +580,35 @@ class TestTensorRegisterBackwardHook(unittest.TestCase): HOOK_INIT_VALUE = 10 HOOK_IS_CALLED = False - def test_register_backward_hook_for_interior_var(self): + def test_register_backward_hook(self): + with _test_eager_guard(): + self.func_register_backward_hook() + self.func_register_backward_hook() + + def func_register_backward_hook_for_interior_var(self): x = paddle.to_tensor(5., 
stop_gradient=False) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): y._register_backward_hook(global_void_hook) - def test_register_backward_hook_for_var_without_gradient(self): + def test_register_backward_hook_for_interior_var(self): + with _test_eager_guard(): + self.func_register_backward_hook_for_interior_var() + self.func_register_backward_hook_for_interior_var() + + def func_register_backward_hook_for_var_without_gradient(self): x = paddle.to_tensor(5.) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): x._register_backward_hook(global_void_hook) + def test_register_backward_hook_for_var_without_gradient(self): + with _test_eager_guard(): + self.func_register_backward_hook_for_var_without_gradient() + self.func_register_backward_hook_for_var_without_gradient() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index d601117b96f12d35756b521b85902bf91ef01bae..7fb4d39cd7338fb3cd57c786bc811b901351eaf9 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -139,6 +139,28 @@ class TestWhereAPI(unittest.TestCase): fetch_list=[result]) assert np.array_equal(out[0], np.where((x_i > 1), x_i, y_i)) + def test_scalar(self): + paddle.enable_static() + main_program = Program() + with fluid.program_guard(main_program): + cond_shape = [2, 4] + cond = fluid.layers.data( + name='cond', shape=cond_shape, dtype='bool') + x_data = 1.0 + y_data = 2.0 + cond_data = np.array([False, False, True, True]).astype('bool') + result = paddle.where(condition=cond, x=x_data, y=y_data) + for use_cuda in [False, True]: + if (use_cuda and (not fluid.core.is_compiled_with_cuda())): + return + place = (fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()) + exe = fluid.Executor(place) + out = exe.run(fluid.default_main_program(), + feed={'cond': cond_data}, + fetch_list=[result]) + expect = np.where(cond_data, x_data, y_data) + assert np.array_equal(out[0], expect) + def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape): paddle.enable_static() main_program = Program() @@ -227,6 +249,15 @@ class TestWhereDygraphAPI(unittest.TestCase): out = paddle.where(cond, x, y) assert np.array_equal(out.numpy(), np.where(cond_i, x_i, y_i)) + def test_scalar(self): + with fluid.dygraph.guard(): + cond_i = np.array([False, False, True, True]).astype('bool') + x = 1.0 + y = 2.0 + cond = fluid.dygraph.to_variable(cond_i) + out = paddle.where(cond, x, y) + assert np.array_equal(out.numpy(), np.where(cond_i, x, y)) + def __test_where_with_broadcast_dygraph(self, cond_shape, a_shape, b_shape): with fluid.dygraph.guard(): cond_tmp = paddle.rand(cond_shape) diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 74c481fb641aca9a7249f1ce9585700e66301d3b..e7c3cfbb7b93b5deffb95e9ee175a7a03d1aaf7f 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -178,11 +178,13 @@ class DistributedFusedLamb(Optimizer): param_info = self._create_persistable_var('param_info', dtype='int32') param_info.is_distributed = True - fused_offsets = self._create_persistable_var('fused_offsets') + fused_offsets = self._create_persistable_var( + 'fused_offsets', dtype='int32') fp32_partial_fused_offsets = self._create_persistable_var( 'fp32_partial_fused_offsets', dtype='int32') 
fp32_partial_fused_offsets.is_distributed = True + fp16_partial_fused_offsets = self._create_persistable_var( 'fp16_partial_fused_offsets', dtype='int32') fp16_partial_fused_offsets.is_distributed = True diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 5c5517e54f71ad8dde7999561953ca4c03680b90..ecf70ffe4a1dd3179d02a2a6ca1e260e8193d1d1 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -543,8 +543,8 @@ def where(condition, x=None, y=None, name=None): Args: condition(Tensor): The condition to choose x or y. - x(Tensor, optional): x is a Tensor with data type float32, float64, int32, int64. Either both or neither of x and y should be given. - y(Tensor, optional): y is a Tensor with data type float32, float64, int32, int64. Either both or neither of x and y should be given. + x(Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. + y(Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please @@ -571,6 +571,12 @@ def where(condition, x=None, y=None, name=None): # [[2], # [3]]),) """ + if np.isscalar(x): + x = layers.fill_constant([1], np.array([x]).dtype.name, x) + + if np.isscalar(y): + y = layers.fill_constant([1], np.array([y]).dtype.name, y) + if x is None and y is None: return nonzero(condition, as_tuple=True) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 390ccdd157363260435b97993b3181a8532a2d15..7ea8493b67fd6dec6f46df8ca854bbd700ffbfa6 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -34,6 +34,11 @@ kernel : func : conj +- api : copy_to + args : (Tensor x, Backend backend, bool blocking) + output : Tensor + invoke : copy_to_impl(x, backend, blocking) + - api : divide args : (Tensor x, Tensor y) output : Tensor @@ -162,6 +167,11 @@ kernel : func : sign +- api : split + args : (Tensor x, ScalarArray num_or_sections, Scalar axis) + output : Tensor[] + invoke : split_impl(x, num_or_sections, axis) + - api : subtract args : (Tensor x, Tensor y) output : Tensor @@ -177,7 +187,6 @@ func : SumInferMeta kernel : func : sum - param : [x, axis, dtype, keep_dim] data_type : x - api : zeros_like diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 997b64db967916eac297b52074277bafbc67c2e7..5fc9dfe3f6499701f75fffc62bdcf3f9a0c28821 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -35,7 +35,7 @@ class BaseAPI(object): # args_str: # args_declare : "str" // str of function params with default value. Example: (..., bool flag=false) # args_define : "str" // str of function params without default value. 
Example: (..., bool flag) - self.inputs, self.attrs, self.outputs, self.args_str = self.parse_args( + self.inputs, self.attrs, self.outputs, self.args_str, self.optional_vars = self.parse_args( self.api, api_item_yaml) self.is_base_api = True @@ -57,17 +57,22 @@ class BaseAPI(object): return self.api def parse_args(self, api_name, api_item_yaml): + optional_vars = [] + if 'optional' in api_item_yaml: + optional_vars = [ + item.strip() for item in api_item_yaml['optional'].split(',') + ] inputs, attrs, args_str = self.parse_input_and_attr( - api_name, api_item_yaml['args']) + api_name, api_item_yaml['args'], optional_vars) output_type_list, output_names, return_type = self.parse_output( api_name, api_item_yaml['output']) return inputs, attrs, { 'names': output_names, 'types': output_type_list, 'return_type': return_type - }, args_str + }, args_str, optional_vars - def parse_input_and_attr(self, api_name, args_config): + def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): inputs = {'names': [], 'input_info': {}} attrs = {'names': [], 'attr_info': {}} args_str = args_config.strip() @@ -79,11 +84,43 @@ class BaseAPI(object): 'Tensor': 'const Tensor&', 'Tensor[]': 'const std::vector&' } - attr_types_map = {'ScalarArray' : 'const ScalarArray&', 'Scalar' : 'const Scalar&', \ - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ - 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ - 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ - 'int64_t[]' : 'const std::vector&', 'int[]' : 'const std::vector&'} + attr_types_map = { + 'ScalarArray': 'const ScalarArray&', + 'Scalar': 'const Scalar&', + 'int': 'int', + 'int32_t': 'int32_t', + 'int64_t': 'int64_t', + 'long': 'long', + 'size_t': 'size_t', + 'float': 'float', + 'double': 'double', + 'bool': 'bool', + 'Backend': 'Backend', + 'DataLayout': 'DataLayout', + 'DataType': 'DataType', + 'int64_t[]': 'const std::vector&', + 'int[]': 'const std::vector&', + 'long[]': 'const std::vector&' + } + optional_types_trans = { + 'Tensor': 'const paddle::optional&', + 'Tensor[]': 'const paddle::optional>&', + 'ScalarArray': 'const paddle::optional&', + 'Scalar': 'const paddle::optional&', + 'int': 'paddle::optional', + 'int32_t': 'paddle::optional', + 'int64_t': 'paddle::optional', + 'size_t': 'paddle::optional', + 'float': 'paddle::optional', + 'double': 'paddle::optional', + 'bool': 'paddle::optional', + 'Backend': 'paddle::optional', + 'DataLayout': 'paddle::optional', + 'DataType': 'paddle::optional', + 'int64_t[]': 'paddle::optional>', + 'int[]': 'paddle::optional>' + } + args_declare_str = "" args_define_str = "" @@ -100,6 +137,9 @@ class BaseAPI(object): assert len(attrs['names']) == 0, \ f"The input Tensor should appear before attributes. 
please check the position of {api_name}:input({input_name}) in yaml" + if input_name in optional_vars: + in_type = optional_types_trans[in_type_symbol] + inputs['names'].append(input_name) inputs['input_info'][input_name] = in_type args_declare_str = args_declare_str + in_type + ' ' + input_name + ', ' @@ -121,6 +161,9 @@ class BaseAPI(object): attr_name = attr_infos[0].strip() default_value = attr_infos[1].strip() + if attr_name in optional_vars: + attr_type = optional_types_trans[attr_type_symbol] + default_value_str = "" if default_value is None else '=' + default_value args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', ' args_define_str = args_define_str + attr_type + ' ' + attr_name + ', ' @@ -381,7 +424,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. || kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED ) {{ auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args}); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == Backend::UNDEFINED) {{ kernel_backend = kernel_key.backend(); }} @@ -408,7 +451,17 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = "" for param in infer_meta_params: if param in input_names: - param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + if param in self.optional_vars: + meta_tensor_code = meta_tensor_code + f""" +{code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); +{code_indent} if ({PREFIX_TENSOR_NAME}meta_{param}) {{ +{code_indent} {PREFIX_TENSOR_NAME}meta_ref_{param} = paddle::make_optional(*{PREFIX_TENSOR_NAME}meta_{param}); +{code_indent} }}""" + + param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " + else: + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " elif param in kernel_output_names: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" @@ -435,7 +488,11 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 'const std::vector&': 'const std::vector&', 'const std::vector &': - 'const std::vector&' + 'const std::vector&', + 'const paddle::optional&': + 'paddle::optional', + 'const paddle::optional>&': + 'paddle::optional&>' } out_trans_map = { 'Tensor': 'phi::DenseTensor*', @@ -459,19 +516,40 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
trans_flag = "{true}" elif input_name in self.data_transform['support_trans_dtype']: trans_flag = "{false, true}" - input_tensor_code = input_tensor_code + f""" + if input_name in self.optional_vars: + input_tensor_code = input_tensor_code + f""" +{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag}); +{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); +{code_indent} }}""" + + else: + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" else: - input_tensor_code = input_tensor_code + f""" + if input_name in self.optional_vars: + input_tensor_code = input_tensor_code + f""" +{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = TensorToDenseTensor({input_name}); +{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); +{code_indent} }}""" + + else: + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = TensorToDenseTensor({input_name});""" kernel_args = "*dev_ctx, " for param in kernel_param: if param in input_names: - kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " - kernel_args_type_list.append(input_trans_map[input_infos[ - param]]) + if param in self.optional_vars: + kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " + else: + kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + kernel_in_type = input_trans_map[input_infos[param]] + kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context if 'ScalarArray' in self.attrs['attr_info'][param][0]: @@ -499,21 +577,16 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. def get_selected_rows_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::SelectedRows&', - 'const Tensor &': 'const phi::SelectedRows&' + 'const Tensor &': 'const phi::SelectedRows&', + 'const paddle::optional&': + 'paddle::optional' } out_trans_map = {'Tensor': 'phi::SelectedRows*'} input_names = self.inputs['names'] input_infos = self.inputs['input_info'] kernel_args_type_list = ['const platform::DeviceContext&'] - input_tensor_code = "" - for input_name in input_names: - # set input code - input_tensor_code = input_tensor_code + f""" - auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name});""" - attr_names = self.attrs['names'] - kernel_param = self.kernel['param'] if kernel_param is None: kernel_param = input_names + attr_names @@ -521,15 +594,28 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
input_tensor_code = "" for i, input_name in enumerate(input_names): # set input code - input_tensor_code = input_tensor_code + f""" + if input_name in self.optional_vars: + input_tensor_code = input_tensor_code + f""" + +{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = TensorToSelectedRows({input_name}); +{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); +{code_indent} }}""" + + else: + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name});""" kernel_args = "*dev_ctx, " for param in kernel_param: if param in input_names: - kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " - kernel_args_type_list.append(input_trans_map[input_infos[ - param]]) + if param in self.optional_vars: + kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " + else: + kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + kernel_in_type = input_trans_map[input_infos[param]] + kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context if 'ScalarArray' in self.attrs['attr_info'][param][0]: diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index f1e69a21f28d86e424bd2bb91732e29c7a2971d9..a26630ad04100fbebdb7c270b83912bb722040d4 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -92,6 +92,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" """ @@ -102,6 +103,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/data_transform.h" diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 62b724432e9283613f69852ec04eda55a88b0ab2..cdda5cb1f05e84f05f468837dae1f59116fa293f 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -8,6 +8,17 @@ kernel : func : matmul_grad +- backward_api : matmul_double_grad + forward : matmul_grad (Tensor x, Tensor y, Tensor out_grad, bool transpose_x, bool transpose_y) -> Tensor(dx), Tensor(dy) + args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) + output : Tensor(d2x), Tensor(d2y), Tensor(dout_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, y, out_grad] + kernel : + func : matmul_double_grad + optional : dx_grad, dy_grad + - backward_api : scale_grad forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, Scalar scale, float bias=0.0, bool bias_after_scale=true) @@ -15,15 +26,6 @@ invoke : scale(out_grad, scale, bias, bias_after_scale) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. 
-# -# - backward_api : matmul_double_grad -# forward : matmul_grad (Tensor x, Tensor y, Tensor out_grad, bool transpose_x, bool transpose_y) -> Tensor(dx), Tensor>(dy) -# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -# output : Tensor(d2x), Tensor(d2y), Tensor(dout_grad) -# infer_meta : -# func : MatmulDoubleGradInferMeta -# kernel : -# func : matmul_double_grad # - backward_api : matmul_triple_grad # forward : matmul_double_grad (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -> Tensor(d2x), Tensor(d2y), Tensor(dout_grad) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 28eb1de37b697d22ac37fd5ff03a0b4debcbd2b3..2d33cd5b1812ada8fca118c0e0f616cfbe511dd1 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -31,10 +31,10 @@ class BackwardAPI(BaseAPI): def parse_forward_config(self, forward_config): # api_name (const Tensor& input, ... , int attr, ...) -> Tensor(out) result = re.search( - r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->[^\(]*\((?P[^\)]+)\)", + r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", forward_config) api = result.group('api') - outputs = [item.strip() for item in result.group('outputs').split(',')] + _, outputs, _ = self.parse_output(self.api, result.group('outputs')) fw_inputs, fw_attrs, _, = self.parse_input_and_attr( api, result.group('args')) @@ -47,7 +47,7 @@ class BackwardAPI(BaseAPI): # check the inputs of backward for input in self.inputs['names']: - if input not in fw_inputs and input not in fw_outputs: + if input not in fw_inputs['names'] and input not in fw_outputs: if input.endswith('_grad'): original_name = input[:-5] assert original_name in fw_outputs, \ @@ -132,6 +132,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" """ @@ -142,6 +143,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/data_transform.h" diff --git a/python/setup.py.in b/python/setup.py.in index f39429387dbc3e309edd1ebc60767071811c5ee4..ec1b1cbcb9510c80a42dff49fa1a5121a9cb487f 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -571,13 +571,13 @@ def find_files(pattern, root, recursive=False): headers = ( # paddle level api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api')) + # pten unify api header + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api')) + # phi unify api header list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/ext')) + # custom op api - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) + # pten api - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/common')) + # pten common headers - # pten level api headers (low level api) - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # pten core headers - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # pten backends headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) + # phi api + list(find_files('*.h', 
'@PADDLE_SOURCE_DIR@/paddle/phi/common')) + # phi common headers + # phi level api headers (low level api) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # phi core headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # phi backends headers # utila api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)) + # paddle utils headers ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) diff --git a/tools/check_file_suffix.py b/tools/check_file_suffix.py deleted file mode 100644 index 1d422dd6c4fe0272d6ab1425c05591d449953591..0000000000000000000000000000000000000000 --- a/tools/check_file_suffix.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import json - - -def check_suffix(): - suffix_arr = [".pyc"] - json_buff = "" - for line in sys.stdin: - json_buff = "".join([json_buff, line]) - json_obj = json.loads(json_buff) - if not isinstance(json_obj, list): - print('Json String Should be a list Object\n') - return - files_with_invalid_suffix = [] - for i in range(len(json_obj)): - file_name = json_obj[i]["filename"] - if file_name == None: - continue - for suffix in suffix_arr: - if file_name.endswith(suffix): - files_with_invalid_suffix.append(file_name) - break - if len(files_with_invalid_suffix) != 0: - print('Error: Find file(s): [\n') - for i in range(len(files_with_invalid_suffix)): - print('\t' + files_with_invalid_suffix[i] + '\n') - print( - ' ] end(s) with invalid suffix, Please check if these files are temporary.' - ) - - -if __name__ == "__main__": - check_suffix() diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index b0c834718b1b3b457b2784243c62fe848eeed902..f3e9f345da27b4c4bb06499dfc14b12cbd406715 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -219,7 +219,7 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = '.'.join( + ir_name = 'pten.' + '.'.join( [it.lower() for it in update_item[:3]]) + "." + ir_dtype res += f""" registry->AddKernel("{ir_name}","""
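As a quick sanity check of the renaming in tools/infrt/get_phi_kernel_info.py above, the snippet below is a minimal, self-contained sketch of how the generated IR kernel name is assembled after this change. The concrete values of update_item and ir_dtype are assumed purely for illustration; only the ir_name expression itself comes from the patch.

# Hypothetical inputs: update_item holds [kernel, backend, layout, func] and
# ir_dtype is one of the per-kernel dtypes iterated by gen_register_info().
update_item = ['Add', 'CPU', 'ALL_LAYOUT', 'AddKernel']  # assumed example values
ir_dtype = 'float32'                                      # assumed example value

# Expression as it appears in gen_register_info() after the change:
ir_name = 'pten.' + '.'.join([it.lower() for it in update_item[:3]]) + "." + ir_dtype
print(ir_name)  # -> pten.add.cpu.all_layout.float32

With the patch applied, every registered kernel name carries the explicit 'pten.' dialect prefix instead of the previous unprefixed kernel.backend.layout.dtype form.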