Unverified commit 3cca89e7, authored by 石晓伟, committed by GitHub

Merge the develop branch (#39362)

Parent commit: d89f246c
......@@ -4,10 +4,13 @@ paddle/fluid/API_DEV.spec
paddle/fluid/API_PR.spec
paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/pten/api/*/api.*
paddle/pten/api/*/backward*
paddle/pten/api/include/api.h
paddle/pten/api/lib/api.cc
paddle/pten/api/backward/backward_api.h
paddle/pten/api/lib/backward_api.cc
paddle/pten/include/*
paddle/pten/extension.h
paddle/fluid/eager/api/generated/*
*.DS_Store
*.vs
......
......@@ -43,6 +43,7 @@ option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler" OFF)
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
......@@ -59,6 +60,9 @@ include(generic) # simplify cmake module
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_XPU_KP)
message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
......@@ -226,6 +230,7 @@ option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
......@@ -273,6 +278,14 @@ if (NOT WITH_GPU AND WITH_NCCL)
"Disable NCCL when compiling without GPU" FORCE)
endif()
# force WITH_XPU on when WITH_XPU_KP
if (WITH_XPU_KP AND NOT WITH_XPU)
MESSAGE(WARNING
"Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.")
set(WITH_XPU ON CACHE STRING
"Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE)
endif()
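# A minimal illustration (the status message is ours, not part of this change): passing
# -DWITH_XPU_KP=ON at configure time is enough, since the block above forces WITH_XPU=ON.
if(WITH_XPU_KP)
  message(STATUS "WITH_XPU_KP implies WITH_XPU=${WITH_XPU}")
endif()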
if (NOT WITH_XPU AND WITH_XPU_BKCL)
MESSAGE(WARNING
"Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
......@@ -280,6 +293,13 @@ if (NOT WITH_XPU AND WITH_XPU_BKCL)
"Disable BKCL when compiling without XPU" FORCE)
endif()
if (NOT WITH_MLU AND WITH_CNCL)
MESSAGE(WARNING
"Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
set(WITH_MLU OFF CACHE STRING
"Disable CNCL when compiling without MLU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)
......@@ -317,6 +337,10 @@ if(WITH_ROCM)
include(miopen) # set miopen libraries, must before configure
endif(WITH_ROCM)
if(WITH_XPU_KP)
include(xpu_kp)
endif()
if (NOT WITH_ROCM AND WITH_RCCL)
MESSAGE(WARNING
"Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
......
......@@ -99,6 +99,11 @@ if(WITH_XPU)
add_definitions(-DPADDLE_WITH_XPU)
endif()
if(WITH_XPU_KP)
message(STATUS "Compile with XPU_KP!")
add_definitions(-DPADDLE_WITH_XPU_KP)
endif()
if(WITH_IPU)
message(STATUS "Compile with IPU!")
add_definitions(-DPADDLE_WITH_IPU)
......
......@@ -654,6 +654,81 @@ function(hip_test TARGET_NAME)
endif()
endfunction(hip_test)
function(xpu_library TARGET_NAME)
if (WITH_XPU_KP)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(xpu_library_SRCS)
if (xpu_library_SHARED OR xpu_library_shared) # build *.so
message(FATAL_ERROR "XPU kernel currently does not support dynamic links")
else()
xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
find_fluid_modules(${TARGET_NAME})
endif()
if (xpu_library_DEPS)
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${xpu_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND xpu_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
else(xpu_library_SRCS)
if (xpu_library_DEPS)
list(REMOVE_DUPLICATES xpu_library_DEPS)
generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:xpu_library")
target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
else()
message(FATAL "Please specify source file or library in xpu_library.")
endif()
endif(xpu_library_SRCS)
endif()
endfunction(xpu_library)
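# Usage sketch for the helper above (target and file names are hypothetical). Shared
# builds are rejected by the FATAL_ERROR branch, so only STATIC makes sense here.
xpu_library(elementwise_add_xpu_kernel STATIC
  SRCS elementwise_add_op.cc elementwise_add_op.xpu
  DEPS op_registry device_context)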
function(xpu_binary TARGET_NAME)
if (WITH_XPU_KP)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${xpu_binary_SRCS})
if(xpu_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS})
add_dependencies(${TARGET_NAME} ${xpu_binary_DEPS})
common_link(${TARGET_NAME})
endif()
endif()
endfunction(xpu_binary)
function(xpu_test TARGET_NAME)
# The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
# and execution of tests in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
# other than *.py are modified.
if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${xpu_test_SRCS})
# "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE
target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endfunction(xpu_test)
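# Usage sketch (hypothetical names): built and registered with ctest only when
# WITH_XPU_KP and WITH_TESTING are ON and CI_SKIP_CPP_TEST is not "ON".
xpu_test(elementwise_add_xpu_kernel_test SRCS elementwise_add_xpu_test.cc DEPS elementwise_add_xpu_kernel)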
function(go_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
......
......@@ -19,4 +19,11 @@ set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
if(WITH_CNCL)
MESSAGE(STATUS "Compile with CNCL!")
ADD_DEFINITIONS(-DPADDLE_WITH_CNCL)
set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
TARGET_LINK_LIBRARIES(neuware_lib ${CNCL_LIB} ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
else()
TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
endif()
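# Hypothetical consumer: any target that links neuware_lib (an MLU op library is shown
# here as an example) now also pulls in libcncl.so when WITH_CNCL is ON.
target_link_libraries(my_mlu_collective_op neuware_lib)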
......@@ -34,6 +34,7 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(hip_cc_srcs)
set(xpu_cc_srcs)
set(xpu_kp_cc_srcs)
set(npu_cc_srcs)
set(mlu_cc_srcs)
set(cudnn_cu_cc_srcs)
......@@ -120,6 +121,11 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_XPU_KP)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
......@@ -154,6 +160,8 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
......@@ -161,11 +169,13 @@ function(op_library TARGET)
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu")
endif()
endforeach()
endif()
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len)
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
......@@ -231,6 +241,8 @@ function(op_library TARGET)
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
else()
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
......@@ -359,6 +371,11 @@ function(op_library TARGET)
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for XPU KP
if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for NPU
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
foreach(npu_src ${npu_cc_srcs})
......@@ -438,7 +455,6 @@ function(op_library TARGET)
endif()
endfunction()
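# Illustration of the new .xpu handling (hypothetical operator): with WITH_XPU_KP=ON and
# elementwise_add_op.xpu present next to elementwise_add_op.cc, the call below builds the
# target via xpu_library() and appends
#   USE_OP_DEVICE_KERNEL(elementwise_add_op, KP);
# to the generated pybind file.
op_library(elementwise_add_op)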
function(register_operators)
set(options "")
set(oneValueArgs "")
......
......@@ -88,11 +88,12 @@ function(kernel_library TARGET)
set(cpu_srcs)
set(gpu_srcs)
set(xpu_srcs)
set(selected_rows_srcs)
# parse and save the dependent kernel targets
set(all_srcs)
set(kernel_deps)
set(oneValueArgs "")
set(oneValueArgs SUB_DIR)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
......@@ -106,6 +107,9 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
endif()
if (WITH_GPU OR WITH_ROCM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
......@@ -131,8 +135,17 @@ function(kernel_library TARGET)
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
else()
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
endif()
foreach(include_kernel ${include_kernels})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel})
else()
string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
endif()
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND kernel_deps ${kernel_name})
endforeach()
......@@ -144,27 +157,30 @@ function(kernel_library TARGET)
list(LENGTH cpu_srcs cpu_srcs_len)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
# Build Target according to different src organization
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
# If the common_srcs depends on specific device srcs, build target using this rule.
${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR
${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
endif()
# If there are only specific device srcs, build target using this rule.
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
......@@ -179,25 +195,42 @@ function(kernel_library TARGET)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
endif()
else()
if (${common_srcs_len} EQUAL 0)
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
# If the selected_rows_srcs depends on common_srcs, build target using this rule.
elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
elseif (WITH_ROCM)
hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
else()
# If the kernel has a device independent public implementation,
# we will use this implementation and will not adopt the implementation
# under specific devices
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
endif()
cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
# If there are only common_srcs or selected_rows_srcs, build target using below rules.
elseif (${common_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
endif()
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
${selected_rows_srcs_len} GREATER 0)
# append target into PTEN_KERNELS property
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
set(pten_kernels ${pten_kernels} ${TARGET})
......@@ -219,11 +252,14 @@ function(kernel_library TARGET)
if (${xpu_srcs_len} GREATER 0)
kernel_declare(${xpu_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
endfunction()
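# Usage sketch of the selected_rows/SUB_DIR support (hypothetical kernel and deps): with
#   scale_kernel.cc, cpu/scale_kernel.cc, gpu/scale_kernel.cu, selected_rows/scale_kernel.cc
# in the current source dir, the call below builds scale_kernel_part from the device
# sources and links the common + selected_rows sources into the final scale_kernel target.
kernel_library(scale_kernel SUB_DIR selected_rows DEPS dense_tensor kernel_context)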
function(register_kernels)
set(options "")
set(oneValueArgs "")
set(oneValueArgs SUB_DIR)
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
......@@ -236,9 +272,9 @@ function(register_kernels)
list(FIND register_kernels_EXCLUDES ${target} _index)
if (${_index} EQUAL -1)
if (${register_kernels_DEPS_len} GREATER 0)
kernel_library(${target} DEPS ${register_kernels_DEPS})
kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR ${register_kernels_SUB_DIR})
else()
kernel_library(${target})
kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR})
endif()
endif()
endforeach()
......@@ -246,9 +282,9 @@ endfunction()
function(append_op_util_declare TARGET)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content)
string(REGEX MATCH "(PT_REGISTER_API_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
string(REPLACE "PT_REGISTER_API_NAME" "PT_REGISTER_API_NAME" util_declare "${util_declare}")
string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
string(APPEND util_declare ");")
file(APPEND ${op_utils_header} "${util_declare}")
endfunction()
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_XPU_KP)
return()
endif()
if(NOT XPU_TOOLCHAIN)
set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK)
get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH)
endif()
if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!")
endif()
message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
# The host sysroot of XPU compiler is gcc-8.2
if(NOT HOST_SYSROOT)
set(HOST_SYSROOT /opt/compiler/gcc-8.2)
endif()
if(NOT IS_DIRECTORY ${HOST_SYSROOT})
message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!")
endif()
if(NOT API_ARCH)
set(API_ARCH x86_64-baidu-linux-gnu)
endif()
if(API_ARCH MATCHES "x86_64")
if(EXISTS ${HOST_SYSROOT}/bin/g++)
set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
set(HOST_AR ${HOST_SYSROOT}/bin/ar)
else()
set(HOST_CXX /usr/bin/g++)
set(HOST_AR /usr/bin/ar)
endif()
else()
set(HOST_CXX ${CMAKE_CXX_COMPILER})
set(HOST_AR ${CMAKE_AR})
endif()
set(TOOLCHAIN_ARGS )
if(OPT_LEVEL)
set(OPT_LEVEL ${OPT_LEVEL})
else()
set(OPT_LEVEL "-O3")
endif()
message(STATUS "Build with API_ARCH=" ${API_ARCH})
message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS})
message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
message(STATUS "Build with HOST_AR=" ${HOST_AR})
macro(compile_kernel COMPILE_ARGS)
set(options "")
set(oneValueArgs "")
set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS)
cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(kernel_path ${xpu_add_library_DIRPATH})
set(kernel_name ${xpu_add_library_XNAME})
set(device_o_extra_flags ${xpu_add_library_DEVICE})
set(host_o_extra_flags ${xpu_add_library_HOST})
set(xpu_1_or_2 ${xpu_add_library_XPU})
set(cc_depends ${xpu_add_library_DEPENDS})
set(kernel_target ${kernel_name}_kernel)
add_custom_target(${kernel_target}
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
kernel_build/${kernel_name}.host.o
kernel_build/${kernel_name}.bin.o
COMMENT
${kernel_target}
VERBATIM
)
if(cc_depends)
add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
endif()
set(arg_device_o_extra_flags ${device_o_extra_flags})
separate_arguments(arg_device_o_extra_flags)
set(arg_host_o_extra_flags ${host_o_extra_flags})
separate_arguments(arg_host_o_extra_flags)
set(XTDK_DIR ${XPU_TOOLCHAIN})
set(CXX_DIR ${HOST_SYSROOT})
set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
#include path
get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
set(XPU_CXX_INCLUDES "")
foreach(dir IN LISTS dirs)
list(APPEND XPU_CXX_INCLUDES "-I${dir}")
endforeach()
string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" )
separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}")
#related flags
get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS )
set(XPU_CXX_DEFINES "")
foreach(def IN LISTS DirDefs)
list(APPEND XPU_CXX_DEFINES "-D${def}")
endforeach()
string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" )
separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}")
add_custom_command(
OUTPUT
kernel_build/${kernel_name}.bin.o
COMMAND
${CMAKE_COMMAND} -E make_directory kernel_build
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
--xpu-device-only -c -v
COMMAND
${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_add_library_DEPENDS}
COMMENT
kernel_build/${kernel_name}.bin.o
VERBATIM
)
list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
add_custom_command(
OUTPUT
kernel_build/${kernel_name}.host.o
COMMAND
${CMAKE_COMMAND} -E make_directory kernel_build
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
--xpu-host-only -c -v
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_add_library_DEPENDS}
COMMENT
kernel_build/${kernel_name}.host.o
VERBATIM
)
list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
endmacro()
###############################################################################
# XPU_ADD_LIBRARY
###############################################################################
macro(xpu_add_library TARGET_NAME)
# Separate the sources from the options
set(options "")
set(oneValueArgs "")
set(multiValueArgs STATIC DEPENDS)
cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(xpu_srcs ${xpu_add_library_STATIC})
set(xpu_target ${TARGET_NAME})
set(cc_srcs_depends ${xpu_add_library_DEPENDS})
file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
set(XPU1_HOST_O_EXTRA_FLAGS " ")
# Distinguish .xpu file from other files
foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
get_filename_component(language_type_name ${cur_xpu_src} EXT)
if(${language_type_name} STREQUAL ".xpu")
list(APPEND xpu_kernel_lists ${cur_xpu_src})
else()
list(APPEND cc_kernel_lists ${cur_xpu_src})
endif()
endforeach()
# Ensure that there is only one xpu kernel
list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
list(LENGTH cc_srcs_depends cc_srcs_depends_num)
if(${xpu_kernel_lists_num})
foreach(xpu_kernel IN LISTS xpu_kernel_lists)
get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
set(kernel_name ${kernel_name})
compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
endforeach()
add_custom_target(${xpu_target}_src ALL
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_kernel_depends}
${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
COMMENT
${xpu_target}_src
VERBATIM
)
add_custom_command(
OUTPUT
${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
COMMAND
${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_kernel_depends}
COMMENT
${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
VERBATIM
)
add_library(${xpu_target} STATIC ${cc_kernel_lists})
add_dependencies(${xpu_target} ${xpu_target}_src)
target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
else()
add_library(${xpu_target} STATIC ${cc_kernel_lists})
endif()
endmacro()
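# Direct usage sketch (hypothetical names); xpu_library() above calls this macro the same
# way. compile_kernel() builds scale_op.xpu twice (device .bin.o and host .host.o), the
# objects are archived into libscale_op_xpu_xpu.a, and the remaining .cc sources form the
# STATIC target that links against that archive.
xpu_add_library(scale_op_xpu STATIC scale_op.cc scale_op.xpu DEPENDS op_registry)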
......@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
......
......@@ -213,6 +213,7 @@ int32_t BrpcPsClient::initialize() {
auto &profiler = CostProfiler::instance();
profiler.register_profiler("pserver_client_pull_dense");
profiler.register_profiler("pserver_client_pull_sparse");
profiler.register_profiler("pserver_client_pull_sparse_param");
profiler.register_profiler("pserver_client_pull_sparse_local");
profiler.register_profiler("pserver_client_push_sparse");
profiler.register_profiler("pserver_client_push_sparse_parse");
......@@ -543,6 +544,7 @@ std::future<int32_t> BrpcPsClient::pull_geo_param(size_t table_id,
return fut;
}
// for GEO
std::future<int32_t> BrpcPsClient::push_sparse_param(
size_t table_id, const uint64_t *keys, const float **update_values,
size_t num, void *done) {
......@@ -558,18 +560,8 @@ std::future<int32_t> BrpcPsClient::push_sparse_param(
ids.resize(request_call_num);
value_ptrs.resize(request_call_num);
const auto &server_param = _config.server_param().downpour_server_param();
uint64_t shard_num = FLAGS_pserver_sparse_table_shard_num;
for (int i = 0; i < server_param.downpour_table_param_size(); ++i) {
const auto &table_param = server_param.downpour_table_param(i);
if (table_param.table_id() == table_id) {
shard_num = table_param.shard_num();
break;
}
}
for (size_t i = 0; i < num; ++i) {
size_t pserver_idx = get_sparse_shard(shard_num, request_call_num, keys[i]);
size_t pserver_idx = keys[i] % request_call_num;
ids[pserver_idx].push_back(keys[i]);
value_ptrs[pserver_idx].push_back(update_values[i]);
}
......@@ -1003,6 +995,120 @@ std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
return fut;
}
// for GEO
std::future<int32_t> BrpcPsClient::pull_sparse_param(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num,
bool is_training) {
auto timer = std::make_shared<CostTimer>("pserver_client_pull_sparse_param");
size_t request_call_num = _server_channels.size();
auto shard_sorted_kvs = std::make_shared<
std::vector<std::vector<std::pair<uint64_t, float *>>>>();
shard_sorted_kvs->resize(request_call_num);
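// Bucket each key and its output pointer by key % request_call_num, so every pserver
// shard receives its own (later sorted) request batch.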
for (size_t i = 0; i < num; ++i) {
size_t shard_id = keys[i] % request_call_num;
shard_sorted_kvs->at(shard_id).push_back({keys[i], select_values[i]});
}
auto *accessor = table_accessor(table_id);
size_t value_size = accessor->select_size();
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [shard_sorted_kvs, value_size](void *done) {
int ret = 0;
auto *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) {
if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) {
ret = -1;
break;
}
auto &request_kvs = shard_sorted_kvs->at(i);
auto &res_io_buffer = closure->cntl(i)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
uint64_t last_key = UINT64_MAX;
float *last_value_data = NULL;
// can remove sort&unique
for (size_t kv_idx = 0; kv_idx < request_kvs.size(); ++kv_idx) {
auto *kv_pair = &(request_kvs[kv_idx]);
if (kv_pair->first == last_key) {
memcpy(reinterpret_cast<void *>(kv_pair->second),
reinterpret_cast<void *>(last_value_data), value_size);
} else {
last_key = kv_pair->first;
last_value_data = kv_pair->second;
if (value_size !=
io_buffer_itr.copy_and_forward(
reinterpret_cast<void *>(last_value_data), value_size)) {
LOG(WARNING) << "res data is lack or not in format";
ret = -1;
break;
}
}
}
}
closure->set_promise_value(ret);
});
closure->add_timer(timer);
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
auto &sorted_kvs = shard_sorted_kvs->at(i);
std::sort(sorted_kvs.begin(), sorted_kvs.end(),
[](const std::pair<uint64_t, float *> &k1,
const std::pair<uint64_t, float *> &k2) {
return k1.first < k2.first;
});
uint64_t last_key = UINT64_MAX;
uint32_t kv_request_count = 0;
size_t sorted_kv_size = sorted_kvs.size();
auto &request_buffer = closure->cntl(i)->request_attachment();
request_buffer.append(reinterpret_cast<void *>(&is_training), sizeof(bool));
std::vector<uint32_t> keys_counter;
keys_counter.reserve(sorted_kv_size);
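// Per-shard request layout: a bool is_training flag, then each unique key as a uint64,
// followed by one uint32 repeat count per unique key; the number of unique keys travels
// separately in the request params as kv_request_count.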
for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) {
++kv_request_count;
uint32_t keys = 1;
last_key = sorted_kvs[kv_idx].first;
request_buffer.append(reinterpret_cast<void *>(&last_key),
sizeof(uint64_t));
while (kv_idx < sorted_kv_size - 1 &&
last_key == sorted_kvs[kv_idx + 1].first) {
++kv_idx;
++keys;
}
keys_counter.push_back(keys);
}
request_buffer.append(reinterpret_cast<void *>(keys_counter.data()),
sizeof(uint32_t) * keys_counter.size());
if (kv_request_count == 0) {
closure->Run();
} else {
closure->request(i)->set_cmd_id(PS_PULL_SPARSE_TABLE);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
closure->request(i)->add_params((char *)&kv_request_count, // NOLINT
sizeof(uint32_t));
PsService_Stub rpc_stub(get_cmd_channel(i));
closure->cntl(i)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
}
return fut;
}
std::future<int32_t> BrpcPsClient::send_client2client_msg(
int msg_type, int to_client_id, const std::string &msg) {
auto promise = std::make_shared<std::promise<int32_t>>();
......@@ -1067,12 +1173,14 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id,
std::string var_name = "";
int64_t var_num = 0;
int64_t var_shape = 0;
std::string table_class;
const auto &worker_param = _config.worker_param().downpour_worker_param();
for (size_t i = 0; i < worker_param.downpour_table_param_size(); ++i) {
if (worker_param.downpour_table_param(i).table_id() == table_id) {
var_name = worker_param.downpour_table_param(i).common().table_name();
var_num = worker_param.downpour_table_param(i).common().table_num();
var_shape = worker_param.downpour_table_param(i).common().table_dim();
table_class = worker_param.downpour_table_param(i).table_class();
break;
}
}
......@@ -1094,9 +1202,19 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id,
save_vec.push_back(save_huge_vec.data() + i * var_shape);
}
auto status = pull_sparse(reinterpret_cast<float **>(save_vec.data()),
table_id, save_key.data(), save_key.size(), true);
status.wait();
VLOG(2) << "recv_and_save_table: table_class: " << table_class;
// TODO(zhaocaibei123): new GeoBrpcPSClient, move this to its
// recv_and_save_table
if (table_class == "MemorySparseGeoTable") {
auto status =
pull_sparse_param(reinterpret_cast<float **>(save_vec.data()), table_id,
save_key.data(), save_key.size(), true);
status.wait();
} else {
auto status = pull_sparse(reinterpret_cast<float **>(save_vec.data()),
table_id, save_key.data(), save_key.size(), true);
status.wait();
}
// create lod tensor
std::shared_ptr<framework::Scope> scope;
......
......@@ -194,6 +194,10 @@ class BrpcPsClient : public PSClient {
size_t table_id,
const uint64_t *keys, size_t num,
bool is_training);
virtual std::future<int32_t> pull_sparse_param(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num, bool is_training);
virtual std::future<int32_t> print_table_stat(uint32_t table_id);
......
......@@ -354,7 +354,7 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id,
bool training = true;
auto status = _worker_ptr->pull_sparse(
auto status = _worker_ptr->pull_sparse_param(
(float **)push_g_vec.data(), table_id, // NOLINT
sparse_push_keys.data(), sparse_push_keys.size(), training);
status.wait();
......@@ -1029,7 +1029,7 @@ void GeoCommunicator::Send(const std::vector<std::string> &var_names,
auto &sparse_ids_set = iter.second;
auto sparse_ids_vec = std::make_shared<std::vector<int64_t>>();
sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end());
sparse_id_queues_.at(key)->Push(sparse_ids_vec);
sparse_id_queues_.at(key)->Put(sparse_ids_vec);
VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key
<< "'s queue";
}
......@@ -1051,7 +1051,10 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (!ctx.is_sparse) continue;
if (!ctx.is_sparse) {
parallel_task_nums_ += 1;
continue;
}
auto &varnames = ctx.origin_varnames;
PADDLE_ENFORCE_EQ(
varnames.size(), 1,
......@@ -1060,12 +1063,11 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
for (auto &splited_var : ctx.splited_varnames) {
parallel_task_nums_ += 1;
sparse_id_queues_.insert(
std::pair<std::string, std::shared_ptr<BlockingQueue<
std::shared_ptr<std::vector<int64_t>>>>>(
std::pair<std::string, paddle::framework::Channel<
std::shared_ptr<std::vector<int64_t>>>>(
splited_var,
std::make_shared<
BlockingQueue<std::shared_ptr<std::vector<int64_t>>>>(
send_queue_size_)));
paddle::framework::MakeChannel<
std::shared_ptr<std::vector<int64_t>>>(send_queue_size_)));
}
}
......@@ -1153,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
......@@ -1183,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1242,8 +1244,8 @@ std::vector<int64_t> GeoCommunicator::MergeSparseIds(
VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num;
if (sparse_id_queues_.at(send_varname)->Size() > 0) {
wait_times = 0;
std::shared_ptr<std::vector<int64_t>> pop_ids =
sparse_id_queues_.at(send_varname)->Pop();
std::shared_ptr<std::vector<int64_t>> pop_ids = nullptr;
sparse_id_queues_.at(send_varname)->Get(pop_ids);
for (size_t j = 0; j < pop_ids->size(); j++) {
sparse_ids.insert(pop_ids->at(j));
}
......@@ -1268,6 +1270,9 @@ void GeoCommunicator::SendSparse(const std::string &varname,
std::vector<int64_t> &sparse_ids, int table_id,
int ep_idx) {
platform::RecordEvent record_event("GeoCommunicator->SendSparse");
if (sparse_ids.size() == 0) {
return;
}
std::string param_name = SplitedGradToParam(varname);
VLOG(1) << "In GeoCommunicator::SendSparse(" << varname << " " << param_name
<< ", ids.size = " << sparse_ids.size() << ", table_id: " << table_id
......@@ -1287,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
......@@ -1313,6 +1318,10 @@ void GeoCommunicator::SendSparse(const std::string &varname,
t_value + j * dims1,
t_old->data<float>() + sparse_ids[j] * dims1);
push_g_vec.push_back(t_value + j * dims1);
VLOG(5) << "DEBUG GeoCommunicator::SendSparse send sparse key "
<< sparse_ids[j] << " value[0] " << push_g_vec[j][0]
<< " value[-1] " << push_g_vec[j][dims1 - 1];
}
++_async_call_num;
......@@ -1361,12 +1370,15 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
<< "value[0] " << values[j * dims1] << " value[-1] "
<< values[j * dims1 + dims1 - 1];
float *latest_data = t_latest->data<float>() + keys[j] * dims1;
float *old_data = t_old->data<float>() + keys[j] * dims1;
// pserver - old => delta
......
......@@ -30,6 +30,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable_helper.h"
......@@ -178,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
......@@ -203,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
......@@ -626,9 +627,8 @@ class GeoCommunicator : public AsyncCommunicator {
// parameter on pserver
std::shared_ptr<Scope> pserver_scope_;
std::unordered_map<
std::string,
std::shared_ptr<BlockingQueue<std::shared_ptr<std::vector<int64_t>>>>>
std::unordered_map<std::string, paddle::framework::Channel<
std::shared_ptr<std::vector<int64_t>>>>
sparse_id_queues_;
};
......
......@@ -128,6 +128,17 @@ class PSClient {
const uint64_t *keys, size_t num,
bool is_training) = 0;
virtual std::future<int32_t> pull_sparse_param(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num, bool is_training) {
VLOG(0) << "Did not implement";
std::promise<int32_t> promise;
std::future<int> fut = promise.get_future();
promise.set_value(-1);
return fut;
}
virtual ::std::future<int32_t> pull_sparse_ptr(char **select_values,
size_t table_id,
const uint64_t *keys,
......
......@@ -47,6 +47,9 @@ cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framewo
cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table)
cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
target_link_libraries(table -fopenmp)
......@@ -15,13 +15,9 @@
#pragma once
#include <ThreadPool.h>
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
namespace paddle {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
namespace paddle {
namespace distributed {
int32_t MemorySparseGeoTable::push_sparse_param(const uint64_t* keys,
const float* values,
size_t num) {
VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse_param begin "
"push_sparse_param "
<< num;
auto shard_num = _task_pool_size;
std::vector<std::vector<uint64_t>> offset_bucket;
offset_bucket.resize(shard_num);
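// Group the incoming key indices into per-shard buckets (key % shard_num); each shard's
// single-threaded task pool then copies its slice of values into the local shard.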
for (int x = 0; x < num; ++x) {
auto y = keys[x] % shard_num;
offset_bucket[y].push_back(x);
if (x < 10) {
VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse_param key: "
<< keys[x] << " shard: " << y;
}
}
std::vector<std::future<int>> tasks(shard_num);
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, &keys, &offset_bucket, &values]() -> int {
auto& local_shard = _local_shards[shard_id];
auto& offsets = offset_bucket[shard_id];
for (int i = 0; i < offsets.size(); ++i) {
auto offset = offsets[i];
auto id = keys[offset];
auto& feature_value = local_shard[id];
feature_value.resize(_dim);
std::copy_n(values + _dim * offset, _dim, feature_value.data());
if (i < 10) {
VLOG(5) << "MemorySparseGeoTable::push_sparse_param "
"push_sparse_param key "
<< id << " value[0]: " << (values + _dim * offset)[0]
<< " data: " << feature_value.data()[0]
<< " value[-1]: " << (values + _dim * offset)[_dim - 1]
<< " data: " << feature_value.data()[_dim - 1];
}
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t MemorySparseGeoTable::pull_geo_param(const uint32_t trainer_id,
std::vector<float>* values,
std::vector<uint64_t>* ids) {
_geo_recorder->GetAndClear(trainer_id, ids);
VLOG(5)
<< "DEBUG MemorySparseGeoTable::pull_geo_param pull_geo_param trainer_id "
<< trainer_id << " id_num: " << ids->size();
std::vector<uint32_t> frequencies;
frequencies.resize(ids->size(), 1);
auto pull_value = PullSparseValue(ids->size(), _dim);
pull_value.is_training_ = true;
pull_value.feasigns_ = ids->data();
pull_value.frequencies_ = frequencies.data();
values->resize(ids->size() * _dim);
pull_sparse(values->data(), pull_value);
return 0;
}
int32_t MemorySparseGeoTable::push_sparse(const uint64_t* keys,
const float* values, size_t num) {
VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse keys[0]" << keys[0]
<< " key_num: " << num;
std::vector<uint64_t> ids;
ids.resize(num);
std::copy_n(keys, num, ids.begin());
_geo_recorder->Update(ids);
_push_sparse(keys, values, num);
return 0;
}
int32_t MemorySparseGeoTable::initialize() {
if (!_geo_recorder) {
auto trainers = _config.common().trainer_num();
_geo_recorder = std::make_shared<GeoRecorder>(trainers);
}
_dim = _config.common().dims()[0];
_shards_task_pool.resize(_task_pool_size);
for (int i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
_local_shards.reset(new shard_type[_task_pool_size]);
return 0;
}
int32_t MemorySparseGeoTable::pull_sparse(float* pull_values,
const PullSparseValue& pull_value) {
auto shard_num = _task_pool_size;
std::vector<std::future<int>> tasks(shard_num);
std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(shard_num);
size_t num = pull_value.numel_;
for (size_t i = 0; i < num; ++i) {
int shard_id = pull_value.feasigns_[i] % shard_num;
task_keys[shard_id].push_back({pull_value.feasigns_[i], i});
}
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, &task_keys, pull_values]() -> int {
auto& local_shard = _local_shards[shard_id];
auto& keys = task_keys[shard_id];
for (size_t i = 0; i < keys.size(); i++) {
uint64_t key = keys[i].first;
auto offset = keys[i].second;
float* select_data = pull_values + _dim * offset;
auto itr = local_shard.find(key);
if (itr == local_shard.end()) {
// ++missed_keys;
auto& feature_value = local_shard[key];
feature_value.resize(_dim);
memset(feature_value.data(), 0, sizeof(float) * _dim);
VLOG(0) << "MemorySparseGeoTable pull_sparse key not found!!! "
<< key;
itr = local_shard.find(key);
}
memcpy(select_data, itr.value().data(), _dim * sizeof(float));
VLOG(5) << "DEBUG MemorySparseGeoTable::pull_sparse key: " << key
<< " select_data[0] " << select_data[0]
<< " value[0]: " << itr.value().data()[0];
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t MemorySparseGeoTable::_push_sparse(const uint64_t* keys,
const float* values, size_t num) {
auto shard_num = _task_pool_size;
std::vector<std::future<int>> tasks(shard_num);
std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(shard_num);
for (size_t i = 0; i < num; ++i) {
int shard_id = keys[i] % shard_num;
task_keys[shard_id].push_back({keys[i], i});
}
for (size_t shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, values, &task_keys]() -> int {
auto& keys = task_keys[shard_id];
auto& local_shard = _local_shards[shard_id];
auto blas = GetBlas<float>();
for (int i = 0; i < keys.size(); ++i) {
uint64_t key = keys[i].first;
uint64_t push_data_idx = keys[i].second;
const float* update_data = values + push_data_idx * _dim;
auto itr = local_shard.find(key);
if (itr == local_shard.end()) {
VLOG(0) << "sparse geo table push not found key!!! " << key;
auto& feature_value = local_shard[key];
feature_value.resize(_dim);
memset(feature_value.data(), 0, sizeof(float) * _dim);
itr = local_shard.find(key);
}
auto& feature_value = itr.value();
float* value_data = feature_value.data();
VLOG(5) << "DEBUG MemorySparseGeoTable::_push_sparse before key: "
<< key << " update_data[0] " << update_data[0]
<< " value[0]: " << value_data[0];
blas.VADD(_dim, update_data, value_data, value_data);
VLOG(5) << "DEBUG MemorySparseGeoTable::_push_sparse after key: "
<< key << " value[0]: " << value_data[0];
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
// #include <pthread.h>
#include <stdint.h>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/common_table.h"
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#include "paddle/fluid/distributed/ps/table/depends/geo_recorder.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
class GeoRecorder;
class MemorySparseGeoTable : public SparseTable {
public:
typedef SparseTableShard<uint64_t, FixedFeatureValue> shard_type;
MemorySparseGeoTable() { _geo_recorder = nullptr; }
virtual ~MemorySparseGeoTable() {}
virtual int32_t initialize();
virtual int32_t initialize_shard() { return 0; }
virtual int32_t load(const std::string& path, const std::string& param) {
return 0;
}
virtual int32_t save(const std::string& path, const std::string& param) {
return 0;
}
virtual int32_t flush() { return 0; }
virtual int32_t shrink(const std::string& param) { return 0; }
virtual void clear() { return; }
virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
int32_t push_sparse_param(const uint64_t* keys, const float* values,
size_t num);
// TODO(zhaocaibei123): change to pull_sparse, and rename pull_sparse
int32_t pull_geo_param(const uint32_t trainer_id, std::vector<float>* values,
std::vector<uint64_t>* keys);
int32_t push_sparse(const uint64_t* keys, const float* values,
size_t num) override;
int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num);
// int32_t _pull_sparse(float* pull_values, const PullSparseValue&
// pull_value);
private:
std::shared_ptr<GeoRecorder> _geo_recorder;
const int _task_pool_size = 10;
std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
std::unique_ptr<shard_type[]> _local_shards;
int _dim;
};
} // namespace distributed
} // namespace paddle
......@@ -20,6 +20,7 @@
#include "paddle/fluid/distributed/ps/table/common_dense_table.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
......@@ -43,6 +44,7 @@ REGISTER_PSCORE_CLASS(Table, TensorTable);
REGISTER_PSCORE_CLASS(Table, DenseTensorTable);
REGISTER_PSCORE_CLASS(Table, GlobalStepTable);
REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable);
REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
......
......@@ -35,3 +35,6 @@ cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost ta
set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table)
set_source_files_properties(memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS ${COMMON_DEPS} boost table)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ThreadPool.h>
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
#include "paddle/fluid/distributed/ps/table/table.h"
namespace paddle {
namespace distributed {
// MemorySparseGeoTable
TEST(MemorySparseGeoTable, SSUM) {
int emb_dim = 10;
int trainers = 2;
TableParameter table_config;
table_config.set_table_class("MemorySparseGeoTable");
FsClientParameter fs_config;
Table *table = new MemorySparseGeoTable();
TableAccessorParameter *accessor_config = table_config.mutable_accessor();
accessor_config->set_accessor_class("CommMergeAccessor");
accessor_config->set_fea_dim(10);
CommonAccessorParameter *common_config = table_config.mutable_common();
common_config->set_name("sum");
common_config->set_table_name("ssum_test_table");
common_config->set_trainer_num(trainers);
common_config->add_params("Param");
common_config->add_dims(emb_dim);
common_config->add_initializers("fill_constant&1.0");
auto ret = table->initialize(table_config, fs_config);
ASSERT_EQ(ret, 0);
// test push_sparse_param, and create params
std::vector<uint64_t> init_keys = {0, 1, 2, 3, 4};
std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
std::vector<float> init_values;
for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
init_values.push_back(0.0);
}
table->push_sparse_param(init_keys.data(), init_values.data(),
init_keys.size());
std::vector<float> pull_values(init_values.size());
auto value = PullSparseValue(init_keys, init_fres, emb_dim);
table->pull_sparse(pull_values.data(), value);
for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5);
}
std::vector<std::vector<uint64_t>> trainer_keys;
std::vector<std::vector<float>> trainer_values;
trainer_keys.resize(trainers);
trainer_values.resize(trainers);
float start = 0.0;
for (int i = 0; i < trainers; i++) {
trainer_keys[i] = init_keys;
for (size_t j = 0; j < trainer_keys[i].size(); j++) {
auto id = trainer_keys[i][j];
for (int k = 0; k < emb_dim; k++) {
trainer_values[i].push_back(start);
pull_values[id * emb_dim + k] += start;
start += 0.1;
}
}
}
std::shared_ptr<::ThreadPool> pool_ =
std::make_shared<::ThreadPool>(trainers);
std::vector<std::future<void>> task_status;
for (int i = 0; i < trainers; i++) {
auto &push_keys = trainer_keys[i];
auto &push_values = trainer_values[i];
auto task = [table, &push_keys, &push_values] {
table->push_sparse(push_keys.data(), push_values.data(),
push_keys.size());
};
task_status.push_back(pool_->enqueue(std::move(task)));
}
for (auto &status : task_status) {
status.wait();
}
std::vector<std::vector<uint64_t>> geo_pull_ids;
std::vector<std::vector<float>> geo_pull_values;
geo_pull_ids.resize(trainers);
geo_pull_values.resize(trainers);
for (int i = 0; i < trainers; i++) {
table->pull_geo_param(i, &geo_pull_values[i], &geo_pull_ids[i]);
ASSERT_EQ(geo_pull_values[i].size(), geo_pull_ids[i].size() * emb_dim);
for (size_t j = 0; j < geo_pull_ids[i].size(); ++j) {
auto id = geo_pull_ids[i][j];
for (int k = 0; k < emb_dim; k++) {
ASSERT_TRUE(abs(geo_pull_values[i][j * emb_dim + k] -
pull_values[id * emb_dim + k]) < 1e-5);
}
}
}
}
} // namespace distributed
} // namespace paddle
set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor legacy autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node)
set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node)
......@@ -9,14 +9,12 @@ endif()
add_subdirectory(api)
add_subdirectory(accumulation)
add_subdirectory(legacy)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulation)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api)
cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
cc_library(legacy SRCS ${DYGRAPH_LEGACY} DEPS global_utils proto_desc operator pten pten_api op_registry variable_helper memcpy)
cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
add_subdirectory(tests)
cc_library(gradient_accumulation SRCS gradient_accumulation.cc DEPS blas pten pten_api var_type_traits layer math_function)
cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulation pten pten_api grad_node_info)
cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info)
......@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/pten/api/all.h"
#include "paddle/pten/core/dense_tensor.h"
......@@ -35,7 +35,7 @@ static void CopyOrAddTensor(egr::EagerTensor* tensor,
*tensor = t;
} else {
// Accumulation
egr::TensorAdd(t, tensor);
paddle::imperative::TensorAdd<egr::EagerTensor>(t, tensor);
}
}
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/api/all.h"
#include "paddle/pten/core/convert_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_XPU
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
namespace egr {
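// TensorAddFunctor is a place visitor: paddle::platform::VisitPlace dispatches
// on the runtime Place and invokes the matching operator() overload, which
// performs y = x + y over numel_ elements on that device, or throws
// PermissionDenied when the place is not supported by the current build.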
template <typename T>
class TensorAddFunctor : public boost::static_visitor<> {
public:
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const paddle::platform::CPUPlace& place) const {
paddle::platform::CPUDeviceContext* ctx =
dynamic_cast<paddle::platform::CPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
auto blas =
paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext, T>(
*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
// TODO(jiabin): Support xpu here from gradient_accumulator.cc
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const paddle::platform::CUDAPlace& place) const {
paddle::platform::CUDADeviceContext* ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
auto blas =
paddle::operators::math::GetBlas<paddle::platform::CUDADeviceContext,
T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const paddle::platform::CUDAPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// TODO(jiabin): Support Npu here from gradient_accumulator.cc
// there is NO blas in CUDAPinnedPlace
void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
#ifdef PADDLE_WITH_XPU
void operator()(const paddle::platform::XPUPlace& place) const {
paddle::platform::XPUDeviceContext* ctx =
dynamic_cast<paddle::platform::XPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
}
#else
void operator()(const paddle::platform::XPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
#ifdef PADDLE_WITH_MLU
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
const T* x_;
mutable T* y_;
};
template <typename DeviceContext, typename T>
void TensorAddImpl(const std::shared_ptr<pten::DenseTensor>& src,
pten::DenseTensor* dst,
const paddle::platform::Place& place) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
paddle::operators::math::ElementwiseAddTo<DeviceContext, T> func;
func(dev_ctx, *(src.get()), dst);
}
template <typename DeviceContext, typename T>
void TensorAddImpl(const paddle::framework::Tensor& src,
paddle::framework::Tensor* dst,
const paddle::platform::Place& place) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
paddle::operators::math::ElementwiseAddTo<DeviceContext, T> func;
func(dev_ctx, src, dst);
}
void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
// TODO(jiabin): Support other tensor type later
std::shared_ptr<pten::DenseTensor> dst_tensor =
std::dynamic_pointer_cast<pten::DenseTensor>(dst->impl());
std::shared_ptr<pten::DenseTensor> src_tensor =
std::dynamic_pointer_cast<pten::DenseTensor>(src.impl());
auto numel = src_tensor->numel();
if (numel == 0) {
return;
}
PADDLE_ENFORCE_EQ(
dst_tensor->numel(), numel,
paddle::platform::errors::PreconditionNotMet(
"The number of elements of source tensor and destination tensor "
"should be equal, but got the number of elements of source tensor is "
"%zu and the number of elements of destination tensor is %zu.",
numel, dst_tensor->numel()));
auto data_type = pten::TransToProtoVarType(src_tensor->dtype());
auto place = src_tensor->place();
PADDLE_ENFORCE_EQ(pten::TransToProtoVarType(dst_tensor->dtype()), data_type,
paddle::platform::errors::PreconditionNotMet(
"The data type of source tensor and destination tensor "
"should be equal, Otherwise, the calculation results "
"will be incorrect."));
#define PADDLE_TENSOR_ADD(cpp_type) \
if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
paddle::platform::VisitPlace(place, func); \
return; \
}
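// The macro above is a lightweight type dispatch: for each candidate C++ type
// it compares against the runtime proto data type and, on a match, runs the
// place-visited add functor and returns early.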
// TODO(jiabin): Support NPU here
PADDLE_TENSOR_ADD(float);
// NOTE(phlrain): XPU only supports float
#ifndef PADDLE_WITH_XPU
PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only complex grad tensor accumulation is supported for
// now; add SelectedRows support if needed in the future
PADDLE_TENSOR_ADD(paddle::platform::complex<float>);
PADDLE_TENSOR_ADD(paddle::platform::complex<double>);
#endif
#undef PADDLE_TENSOR_ADD
if (data_type == paddle::framework::proto::VarType::FP16) {
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<paddle::platform::CUDADeviceContext,
paddle::platform::float16>(src_tensor,
dst_tensor.get(), place);
#else
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
paddle::framework::DataTypeToString(data_type), place));
#endif
} else if (paddle::platform::is_cpu_place(place)) {
return TensorAddImpl<paddle::platform::CPUDeviceContext,
paddle::platform::float16>(src_tensor,
dst_tensor.get(), place);
}
}
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
paddle::framework::DataTypeToString(data_type), place));
}
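// VariableAdd handles the framework::Variable path: when the destination holds
// a LoDTensor, a LoDTensor or SelectedRows source is added into it in place;
// otherwise the SelectedRows destination is combined with a LoDTensor source
// into a new Variable that replaces the old one.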
void VariableAdd(const egr::EagerTensor& src_tensor,
egr::EagerTensor* dst_tensor) {
auto& src = src_tensor.Var();
auto* dst = dst_tensor->MutableVar();
if (dst->IsType<paddle::framework::LoDTensor>()) {
if (src.IsType<paddle::framework::LoDTensor>()) {
paddle::imperative::TensorAdd(src, dst);
} else if (src.IsType<pten::SelectedRows>()) {
paddle::imperative::SelectedRowsAddToTensor(src, dst);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unexpected branch, output variable type is %s",
paddle::framework::ToTypeName(dst->Type())));
}
} else {
if (src.IsType<paddle::framework::LoDTensor>()) {
paddle::framework::Variable new_dst;
paddle::imperative::SelectedRowsAddTensor(*dst, src, &new_dst);
*dst = std::move(new_dst);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unexpected branch, output variable type is %s",
paddle::framework::ToTypeName(dst->Type())));
}
}
}
} // namespace egr
fluid_generated/**
eager_generated/**
\ No newline at end of file
cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info)
if(NOT ON_INFER)
cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps})
add_dependencies(final_dygraph_node eager_final_state_codegen)
endif()
cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node)
if(NOT ON_INFER)
cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps})
add_dependencies(final_dygraph_function eager_final_state_codegen)
endif()
#add_subdirectory(final_state_generator)
add_subdirectory(final_state_generator)
set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
......
......@@ -1220,7 +1220,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
// According to op_proto->attrs()
egr::legacy::RunOp("op_type", ins, outs, attr_map,
Controller.Instance().GetCurrentTracer()->TraceOp("op_type", ins, outs,
attr_map,
Controller.Instance().GetExpectedPlace(), {});
// According to fwd_outputs_names
......@@ -1401,7 +1402,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
const char* FWD_TRACE_OP_TEMPLATE =
" paddle::framework::AttributeMap attrs = attr_map;\n"
" paddle::framework::AttributeMap default_attrs;\n"
" egr::legacy::RunOp(\"%s\", ins, outs, attrs, \n"
" egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, "
"outs, attrs, \n"
" egr::Controller::Instance().GetExpectedPlace(),\n"
" &default_attrs, true, {});\n";
std::string trace_op_str =
......@@ -1712,7 +1714,8 @@ static std::string GenerateSingleOpBase(
" // Pass the entire attribute map to TraceOp\n"
" // The underlying kernel will pickup whatever attribute they need "
"at runtime\n"
" egr::legacy::RunOp(\"%s\", %s, %s, %s,\n"
" egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", %s, "
"%s, %s,\n"
" egr::Controller::Instance().GetExpectedPlace(),\n"
" &this->default_attr_map_, false, {});\n";
std::string trace_opbase_str = paddle::string::Sprintf(
......@@ -1822,7 +1825,8 @@ static std::string GenerateGradNodeCCContents(
// Visit each OpBase
for(auto iter = "grad_node->begin()"; iter < "grad_node->end()"; iter++) {
// Simply pass entire attribute map to kernels
egr::legacy::RunOp("iter->Type()", ins, outs, this->attr_map_,
Controller.Instance().GetCurrentTracer()->TraceOp("iter->Type()", ins,
outs, this->attr_map_,
egr::Controller::Instance().ExpectedPlace(), false, {});
}
......@@ -2054,6 +2058,7 @@ static std::string GenerateDygraphHFileIncludes() {
"#include \"paddle/fluid/eager/autograd_meta.h\"\n"
"#include \"paddle/pten/api/all.h\"\n"
"#include \"paddle/fluid/eager/utils.h\"\n"
"#include \"paddle/fluid/imperative/tracer.h\"\n"
"#include \"paddle/fluid/framework/op_registry.h\"\n\n";
dygraph_forward_api_includes_str +=
......@@ -2084,8 +2089,7 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
"dygraph_forward_api.h\"\n"
"#include "
"\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
"#include \"paddle/fluid/eager/legacy/op_runner.h\"\n";
"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
std::string forward_cc_include_str =
paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
......@@ -2099,7 +2103,7 @@ static void GenerateNodeHFile(const std::string& node_h_path,
std::string node_h_include_str =
"#pragma once\n"
"#include \"paddle/fluid/eager/tensor_wrapper.h\"\n"
"#include \"paddle/fluid/eager/legacy/op_runner.h\"\n"
"#include \"paddle/fluid/imperative/tracer.h\"\n"
"#include \"paddle/fluid/eager/grad_node_info.h\"\n\n";
std::ofstream node_h_stream(node_h_path, std::ios::out);
node_h_stream << node_h_include_str;
......
......@@ -2,13 +2,14 @@ set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h")
set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_node.cc")
set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_node.h")
set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc")
set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h")
set(forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc")
set(forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h")
set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/node.cc")
set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/node.h")
set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc")
set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h")
message("Final State Eager CodeGen")
add_custom_target(eager_final_state_codegen
COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py"
"--api_yaml_path=${api_yaml_path}"
......
......@@ -15,9 +15,45 @@
import sys
import os
if __name__ == "__main__":
assert len(sys.argv) == 2
eager_dir = sys.argv[1]
def GenerateFileStructureForFinalDygraph(eager_dir):
"""
paddle/fluid/eager
|- generated
| |- CMakeLists.txt
| | "add_subdirectory(forwards), add_subdirectory(backwards)"
|
| |- forwards
| |- "dygraph_functions.cc"
| |- "dygraph_functions.h"
|
| |- backwards
| |- "nodes.cc"
| |- "nodes.h"
"""
# Directory Generation
generated_dir = os.path.join(eager_dir, "api/generated/eager_generated")
forwards_dir = os.path.join(generated_dir, "forwards")
nodes_dir = os.path.join(generated_dir, "backwards")
dirs = [generated_dir, forwards_dir, nodes_dir]
for directory in dirs:
if not os.path.exists(directory):
os.mkdir(directory)
# Empty files
dygraph_forward_api_h_path = os.path.join(generated_dir,
"dygraph_functions.h")
empty_files = [dygraph_forward_api_h_path]
empty_files.append(os.path.join(forwards_dir, "dygraph_functions.cc"))
empty_files.append(os.path.join(nodes_dir, "nodes.cc"))
empty_files.append(os.path.join(nodes_dir, "nodes.h"))
for path in empty_files:
if not os.path.exists(path):
open(path, 'a').close()
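# Both generators are idempotent: existing directories and files are left
# untouched, so the script can safely be re-run across repeated builds.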
def GenerateFileStructureForIntermediateDygraph(eager_dir):
"""
paddle/fluid/eager
|- generated
......@@ -79,3 +115,10 @@ if __name__ == "__main__":
with open(generated_level_cmakelist_path, "w") as f:
f.write("add_subdirectory(forwards)\nadd_subdirectory(nodes)")
if __name__ == "__main__":
assert len(sys.argv) == 2
eager_dir = sys.argv[1]
GenerateFileStructureForIntermediateDygraph(eager_dir)
GenerateFileStructureForFinalDygraph(eager_dir)
......@@ -18,10 +18,10 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
// pten deps
#include "paddle/pten/api/all.h"
#include "paddle/pten/api/include/tensor.h"
#include "paddle/pten/api/lib/api_declare.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
/**
* This class is used by Eager mode for now. It's painful to do this in Eager
* Mode, the better
......@@ -245,8 +245,7 @@ class EagerTensor final {
auto tensor_dense =
std::dynamic_pointer_cast<pten::DenseTensor>(tensor_->impl());
if (tensor_dense && tensor_dense.get()) {
paddle::experimental::SharesStorage(tensor_dense.get(),
framework_tensor);
*framework_tensor = *tensor_dense;
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Unrecognized egr::EagerTensor type, only "
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "paddle/fluid/eager/grad_tensor_holder.h"
#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -72,17 +72,17 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
} else {
// Accumulation
if (t.initialized() && buffer_tensor.initialized()) {
TensorAdd(t, &buffer_tensor);
paddle::imperative::TensorAdd<egr::EagerTensor>(t, &buffer_tensor);
} else if (t.Var().IsInitialized() &&
buffer_tensor.Var().IsInitialized()) {
VariableAdd(t, &buffer_tensor);
paddle::imperative::VariableAdd(t, &buffer_tensor);
} else if (t.Var().IsInitialized() && buffer_tensor.initialized()) {
// TODO(jiabin): This can be merged into the upper if case.
buffer_tensor.SyncToVar();
VariableAdd(t, &buffer_tensor);
paddle::imperative::VariableAdd(t, &buffer_tensor);
} else if (t.initialized() && buffer_tensor.Var().IsInitialized()) {
buffer_tensor.SyncToTensor();
TensorAdd(t, &buffer_tensor);
paddle::imperative::TensorAdd<egr::EagerTensor>(t, &buffer_tensor);
} else {
// Cases that should not happen:
// 1. both not init
......
file(GLOB DYGRAPH_LEGACY "*.cpp" "*.cc")
set(DYGRAPH_LEGACY ${DYGRAPH_LEGACY} PARENT_SCOPE)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/amp_auto_cast.h"
#include <memory>
#include <string>
#include "paddle/fluid/eager/legacy/op_runner.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/operator.h"
namespace egr {
namespace legacy {
AmpOperators::AmpOperators()
: allow_ops_(new std::unordered_set<std::string>()),
block_ops_(new std::unordered_set<std::string>()),
unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
auto fp16_dtype = paddle::framework::proto::VarType::FP16;
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
bool supported = false;
for (auto& kernel_type : it->second) {
if ((paddle::platform::is_gpu_place(kernel_type.first.place_) ||
paddle::platform::is_xpu_place(kernel_type.first.place_)) &&
kernel_type.first.data_type_ == fp16_dtype) {
supported = true;
}
}
if (!supported) {
unsupported_fp16_ops_->insert(it->first);
}
}
}
AmpOperators::~AmpOperators() {}
AmpOperators& AmpOperators::Instance() {
static AmpOperators instance;
return instance;
}
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableAllowOps() {
return allow_ops_;
}
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableBlockOps() {
return block_ops_;
}
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableUnsupportedFp16Ops() {
return unsupported_fp16_ops_;
}
std::ostream& operator<<(std::ostream& os, AmpOperators& ops) {
os << "allow ops: ";
auto allow_ops = ops.GetMutableAllowOps();
std::copy((*allow_ops).begin(), (*allow_ops).end(),
std::ostream_iterator<std::string>(os, " "));
os << "\n";
os << "block ops: ";
auto block_ops = ops.GetMutableBlockOps();
std::copy((*block_ops).begin(), (*block_ops).end(),
std::ostream_iterator<std::string>(os, " "));
os << "\n";
os << "unsupported fp16 ops: ";
auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops();
std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(),
std::ostream_iterator<std::string>(os, " "));
return os;
}
inline std::string GetDtypeStr(
const std::shared_ptr<egr::EagerTensor>& tensor) {
return paddle::framework::DataTypeToString(
egr::legacy::GetDtypeFromVar(tensor->Var()));
}
inline bool NeedCast(const std::shared_ptr<egr::EagerTensor>& tensor) {
auto place = egr::legacy::GetPlaceFromVar(tensor->Var());
auto data_type = egr::legacy::GetDtypeFromVar(tensor->Var());
if (paddle::platform::is_gpu_place(place) ||
paddle::platform::is_cuda_pinned_place(place) ||
paddle::platform::is_xpu_place(place)) {
// CUDAPinnedPlace is added for varbase created by dataloader
if (data_type == paddle::framework::proto::VarType::FP32 ||
data_type == paddle::framework::proto::VarType::FP16) {
return true;
}
}
return false;
}
// NOTE: Trace a cast op, so if a var is cast from fp32 to fp16, then the grad
// var will be cast back from fp16 to fp32 during the backward phase.
static inline std::shared_ptr<egr::EagerTensor> CastToType(
const std::shared_ptr<egr::EagerTensor>& tensor,
const paddle::framework::proto::VarType::Type dst_type) {
NameTensorMap ins = {{"X", {tensor}}};
auto in_data_type = egr::legacy::GetDtypeFromVar(tensor->Var());
paddle::framework::AttributeMap attrs = {{"in_dtype", in_data_type},
{"out_dtype", dst_type}};
auto out = std::shared_ptr<egr::EagerTensor>(new egr::EagerTensor());
NameTensorMap outs = {{"Out", {out}}};
{
AutoCastGuard guard(paddle::imperative::AmpLevel::O0);
paddle::framework::AttributeMap default_attrs;
RunOp("cast", ins, outs, std::move(attrs), {}, &default_attrs, true);
}
return out;
}
static inline std::shared_ptr<egr::EagerTensor> CastToFP16(
const std::shared_ptr<egr::EagerTensor>& tensor) {
auto dst_type = paddle::framework::proto::VarType::FP16;
if (NeedCast(tensor) &&
(egr::legacy::GetDtypeFromVar(tensor->Var()) != dst_type)) {
return CastToType(tensor, dst_type);
}
return tensor;
}
static inline std::shared_ptr<egr::EagerTensor> CastToFP32(
const std::shared_ptr<egr::EagerTensor>& tensor) {
auto dst_type = paddle::framework::proto::VarType::FP32;
if (NeedCast(tensor) &&
(egr::legacy::GetDtypeFromVar(tensor->Var()) != dst_type)) {
return CastToType(tensor, dst_type);
}
return tensor;
}
static inline paddle::framework::proto::VarType::Type GetPromoteType(
const std::string& op_type, const NameTensorMap& ins) {
auto dst_type = paddle::framework::proto::VarType::FP16;
for (const auto& pair : ins) {
for (const auto& tensor : pair.second) {
if (egr::legacy::GetDtypeFromVar(tensor->Var()) ==
paddle::framework::proto::VarType::FP32) {
dst_type = egr::legacy::GetDtypeFromVar(tensor->Var());
break;
}
}
}
// NOTE(juncai): moving_average_abs_max_scale only considers the
// dtype of input(X)
if (op_type == "moving_average_abs_max_scale") {
for (const auto& pair : ins) {
if (pair.first == "X" &&
egr::legacy::GetDtypeFromVar(pair.second.front()->Var()) ==
paddle::framework::proto::VarType::FP16) {
dst_type = paddle::framework::proto::VarType::FP16;
}
}
}
return dst_type;
}
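// AutoCastInputs implements the AMP-O1 input casting policy: inputs of ops on
// the allow list are cast to fp16, inputs of ops on the block list are cast to
// fp32, and for every other op the inputs are promoted to a common dtype
// (fp32 wins when any input is fp32, or when the op has no fp16 kernel).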
NameTensorMap AutoCastInputs(const std::string& op_type,
const NameTensorMap& ins) {
NameTensorMap new_ins(ins);
if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) {
for (auto& pair : new_ins) {
// NOTE(zhiqiu): batch_norm and layer_norm only support fp16 for input X.
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float16";
for (auto& var : pair.second) {
var = CastToFP16(var);
}
}
return new_ins;
} else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
for (auto& pair : new_ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float";
for (auto& var : pair.second) {
var = CastToFP32(var);
}
}
return new_ins;
} else {
auto dst_type = GetPromoteType(op_type, ins);
// NOTE(zhiqiu): if the op has no fp16 kernel, fall back to fp32.
if (dst_type == paddle::framework::proto::VarType::FP16 &&
AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(
op_type)) {
dst_type = paddle::framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
// NOTE(zhiqiu): batch_norm and layer_norm only support fp16 for input X.
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first == "X" &&
dst_type == paddle::framework::proto::VarType::FP32) {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< paddle::framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == paddle::framework::proto::VarType::FP32
? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
return new_ins;
}
NameTensorMap CastPureFp16Inputs(const std::string& op_type,
const NameTensorMap& ins) {
NameTensorMap new_ins(ins);
auto dst_type = paddle::framework::proto::VarType::FP16;
if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) ||
AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
dst_type = paddle::framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< paddle::framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == paddle::framework::proto::VarType::FP32
? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_set>
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
namespace egr {
namespace legacy {
class AmpOperators {
public:
~AmpOperators();
AmpOperators(const AmpOperators& o) = delete;
const AmpOperators& operator=(const AmpOperators& o) = delete;
static AmpOperators& Instance();
std::shared_ptr<std::unordered_set<std::string>> GetMutableAllowOps();
std::shared_ptr<std::unordered_set<std::string>> GetMutableBlockOps();
std::shared_ptr<std::unordered_set<std::string>>
GetMutableUnsupportedFp16Ops();
private:
AmpOperators(); // forbid calling default constructor
// The set of ops that support fp16 calculation and are considered numerically
// safe and performance critical. These ops are always converted to fp16.
std::shared_ptr<std::unordered_set<std::string>> allow_ops_;
// The set of ops that support fp16 calculation and are considered numerically
// dangerous and whose effects may also be observed in downstream ops.
std::shared_ptr<std::unordered_set<std::string>> block_ops_;
// The set of ops that have no fp16 CUDA kernel.
std::shared_ptr<std::unordered_set<std::string>> unsupported_fp16_ops_;
};
std::ostream& operator<<(std::ostream& os, AmpOperators& ops);
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
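// A minimal usage sketch (illustrative only):
//   {
//     AutoCastGuard guard(paddle::imperative::AmpLevel::O0);
//     // ops traced inside this scope run with AMP disabled
//   }  // the previous AMP level is restored when the guard goes out of scope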
class AutoCastGuard {
public:
explicit AutoCastGuard(paddle::imperative::AmpLevel guard_level) {
pre_amp_level_ = Controller::Instance().GetAMPLevel();
if (pre_amp_level_ != guard_level) {
Controller::Instance().SetAMPLevel(guard_level);
}
}
~AutoCastGuard() { Controller::Instance().SetAMPLevel(pre_amp_level_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;
private:
paddle::imperative::AmpLevel pre_amp_level_;
};
NameTensorMap AutoCastInputs(const std::string& op_type,
const NameTensorMap& ins);
NameTensorMap CastPureFp16Inputs(const std::string& op_type,
const NameTensorMap& ins);
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/variable.h"
namespace egr {
namespace legacy {
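// EagerExecutionContext adapts eager-mode NameTensorMap inputs/outputs and the
// attribute/default-attribute maps to the paddle::framework::ExecutionContext
// interface, so existing operator kernels can query names, attributes and
// Variables of eager tensors through the usual framework API.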
class EagerExecutionContext : public paddle::framework::ExecutionContext {
using Variable = paddle::framework::Variable;
public:
EagerExecutionContext(const paddle::framework::OperatorBase& op,
const paddle::framework::Scope& scope,
const paddle::platform::DeviceContext& device_context,
const paddle::framework::RuntimeContext& ctx,
const NameTensorMap& tensor_map_in,
const NameTensorMap& tensor_map_out,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs)
: ExecutionContext(op, scope, device_context, ctx),
tensor_map_in_(tensor_map_in),
tensor_map_out_(tensor_map_out),
attrs_(attrs),
default_attrs_(default_attrs) {}
std::string InputName(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
PADDLE_ENFORCE_NE(it, tensor_map_in_.end(),
paddle::platform::errors::PreconditionNotMet(
"Can not find [%s] in Input", name));
// TODO(jiabin): This is used for egr::EagerTensor temporarily,
// once we have name, remove it.
return it->second[0] ? it->second[0]->name()
: paddle::framework::kEmptyVarName;
}
std::vector<std::string> InputNames(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
PADDLE_ENFORCE_NE(
it, tensor_map_in_.end(),
paddle::platform::errors::NotFound("Can not find [%s] in Input", name));
std::vector<std::string> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
// TODO(jiabin): This is used for egr::EagerTensor
// temporarily, once we have name, remove it.
vec_res.push_back(it->second[i]->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
std::string OutputName(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
PADDLE_ENFORCE_NE(it, tensor_map_out_.end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in Output", name));
return it->second[0] ? it->second[0]->name()
: paddle::framework::kEmptyVarName;
}
std::vector<std::string> OutputNames(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
PADDLE_ENFORCE_NE(it, tensor_map_out_.end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in Output", name));
std::vector<std::string> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.push_back(it->second[i]->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
bool HasAttr(const std::string& name) const override {
return attrs_.count(name) != 0 || default_attrs_.count(name) != 0;
}
const paddle::framework::AttributeMap& Attrs() const override {
return attrs_;
}
const paddle::framework::Attribute& GetAttr(
const std::string& name) const override {
auto it = attrs_.find(name);
if (it == attrs_.end()) {
it = default_attrs_.find(name);
if (it == default_attrs_.end()) {
PADDLE_THROW(paddle::platform::errors::NotFound(
"Can not find [%s] in attributes of op %s.", name,
this->GetOp().Type()));
}
}
return it->second;
}
std::vector<std::string> InNameList() const override {
std::vector<std::string> vec_temp;
vec_temp.reserve(tensor_map_in_.size());
for (auto& v : tensor_map_in_) {
vec_temp.push_back(v.first);
}
return vec_temp;
}
bool HasInput(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
return (it != tensor_map_in_.end() && it->second.size() > 0);
}
bool HasOutput(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
return (it != tensor_map_out_.end() && it->second.size() > 0);
}
size_t InputSize(const std::string& name) const override {
return InputNames(name).size();
}
size_t OutputSize(const std::string& name) const override {
return OutputNames(name).size();
}
const Variable* InputVar(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
if (it == tensor_map_in_.end()) {
return nullptr;
}
return it->second.empty() || it->second[0] == nullptr
? nullptr
: it->second[0]->MutableVar();
}
Variable* OutputVar(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
if (it == tensor_map_out_.end()) {
return nullptr;
}
return it->second.empty() || it->second[0] == nullptr
? nullptr
: it->second[0]->MutableVar();
}
const std::vector<Variable*> MultiInputVar(
const std::string& name) const override {
auto it = tensor_map_in_.find(name);
if (it == tensor_map_in_.end()) {
return {};
}
std::vector<Variable*> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
vec_res.push_back(it->second[i] ? it->second[i]->MutableVar() : nullptr);
}
return vec_res;
}
std::vector<Variable*> MultiOutputVar(
const std::string& name) const override {
auto it = tensor_map_out_.find(name);
if (it == tensor_map_out_.end()) {
return {};
}
std::vector<Variable*> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
vec_res.push_back(it->second[i] ? it->second[i]->MutableVar() : nullptr);
}
return vec_res;
}
private:
const NameTensorMap& tensor_map_in_;
const NameTensorMap& tensor_map_out_;
const paddle::framework::AttributeMap& attrs_;
const paddle::framework::AttributeMap& default_attrs_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/var_type.h"
namespace egr {
namespace legacy {
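// EagerInferShapeContext exposes eager-mode NameTensorMap inputs/outputs
// through the paddle::framework::InferShapeContext interface, so operators'
// InferShape functions can run on eager tensors at trace time.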
class EagerInferShapeContext : public paddle::framework::InferShapeContext {
using DDim = paddle::framework::DDim;
public:
EagerInferShapeContext(
const NameTensorMap* in, const NameTensorMap* out,
const paddle::framework::AttributeMap* attr,
const paddle::framework::AttributeMap* default_attr,
const std::string op_type,
const paddle::framework::OpKernelType* op_kernel_type = nullptr)
: tensor_in_(in),
tensor_out_(out),
attrs_(attr),
default_attrs_(default_attr),
op_type_(op_type),
op_kernel_type_(op_kernel_type) {}
bool HasInput(const std::string& name) const override {
// has only one input
auto it = tensor_in_->find(name);
if (it == tensor_in_->end()) {
return false;
}
const auto& in = it->second;
if (in.size() == 0) return false;
PADDLE_ENFORCE_EQ(
in.size(), 1UL,
paddle::platform::errors::PreconditionNotMet(
"Input %s should not have more than one inputs", name));
return in[0] != nullptr;
}
bool HasOutput(const std::string& name) const override {
// has only one output
auto it = tensor_out_->find(name);
if (it == tensor_out_->end()) {
return false;
}
const auto& out = it->second;
if (out.size() == 0) {
return false;
}
PADDLE_ENFORCE_EQ(
out.size(), 1UL,
paddle::platform::errors::PreconditionNotMet(
"Output %s should not have more than one outputs", name));
return out[0] != nullptr;
}
bool HasInputs(const std::string& name) const override {
auto it = tensor_in_->find(name);
if (it == tensor_in_->end() || it->second.empty()) {
return false;
}
for (auto& input : it->second) {
if (input == nullptr) {
return false;
}
}
return true;
}
bool HasOutputs(const std::string& name) const override {
auto it = tensor_out_->find(name);
if (it == tensor_out_->end() || it->second.empty()) {
return false;
}
for (auto& output : it->second) {
if (output == nullptr) {
return false;
}
}
return true;
}
paddle::framework::AttrReader Attrs() const override {
return paddle::framework::AttrReader(*attrs_, *default_attrs_);
}
std::vector<std::string> Inputs(const std::string& name) const override {
std::vector<std::string> vec_res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(
it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not find [%s] in input", name));
vec_res.reserve(it->second.size());
for (auto& var : it->second) {
if (var) {
vec_res.push_back(var->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
std::vector<std::string> Outputs(const std::string& name) const override {
std::vector<std::string> vec_res;
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
vec_res.reserve(it->second.size());
for (auto& var : it->second) {
if (var) {
vec_res.push_back(var->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
std::string GetInputNameByIdx(size_t idx) const override {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
paddle::platform::errors::OutOfRange(
"The index should be less than the size of inputs of "
"operator %s, but got index is %d and size is %d",
op_type_, idx, op_proto->inputs().size()));
return op_proto->inputs()[idx].name();
}
std::string GetOutputNameByIdx(size_t idx) const override {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
PADDLE_ENFORCE_LT(
idx, op_proto->outputs().size(),
paddle::platform::errors::OutOfRange(
"The index should be less than the size of outputs of "
"operator %s, but got index is %d and size is %d",
op_type_, idx, op_proto->outputs().size()));
return op_proto->outputs()[idx].name();
}
void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) override {
auto in_it = tensor_in_->find(in);
auto out_it = tensor_out_->find(out);
PADDLE_ENFORCE_NE(
in_it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not found [%s] in input", in));
PADDLE_ENFORCE_GT(in_it->second.size(), i,
paddle::platform::errors::PreconditionNotMet(
"Inputs %s should have %llu argument", in, i));
PADDLE_ENFORCE_NE(
out_it, tensor_out_->end(),
paddle::platform::errors::NotFound("can not found [%s] in input", in));
PADDLE_ENFORCE_GT(out_it->second.size(), j,
paddle::platform::errors::PreconditionNotMet(
"Outputs %s should have %llu argument", out, j));
paddle::framework::Variable* in_var = in_it->second[i]->MutableVar();
paddle::framework::Variable* out_var = out_it->second[j]->MutableVar();
PADDLE_ENFORCE_EQ(in_var->Type(), out_var->Type(),
paddle::platform::errors::PreconditionNotMet(
"The type of %s and %s is not the same.", in, out));
if (in_var->IsType<paddle::framework::LoDTensor>()) {
auto& in_lod_tensor = in_var->Get<paddle::framework::LoDTensor>();
auto* out_lod_tensor =
out_var->GetMutable<paddle::framework::LoDTensor>();
out_lod_tensor->Resize(in_lod_tensor.dims());
} else {
auto& in_sele_rows = in_var->Get<pten::SelectedRows>();
auto out_sele_rows = out_var->GetMutable<pten::SelectedRows>();
out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
out_sele_rows->set_rows(in_sele_rows.rows());
out_sele_rows->set_height(in_sele_rows.height());
}
}
void ShareAllLoD(const std::string& in,
const std::string& out) const override {
// do nothing
}
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) const override {
// do nothing
}
bool IsRuntime() const override { return true; }
bool IsRunMKLDNNKernel() const override {
return (op_kernel_type_ && (op_kernel_type_->data_layout_ ==
paddle::framework::DataLayout::kMKLDNN));
}
std::vector<paddle::framework::InferShapeVarPtr> GetInputVarPtrs(
const std::string& name) const override {
std::vector<paddle::framework::InferShapeVarPtr> res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(it, tensor_in_->end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in inputs.", name));
for (auto& tensor : it->second) {
res.emplace_back(tensor->MutableVar());
}
return res;
}
std::vector<paddle::framework::InferShapeVarPtr> GetOutputVarPtrs(
const std::string& name) const override {
std::vector<paddle::framework::InferShapeVarPtr> res;
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in outputs.", name));
for (auto& tensor : it->second) {
res.emplace_back(tensor->MutableVar());
}
return res;
}
DDim GetInputDim(const std::string& name) const override {
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(
it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not find [%s] in input", name));
PADDLE_ENFORCE_EQ(
it->second.size(), 1UL,
paddle::platform::errors::PreconditionNotMet(
"Input(%s) should hold one element, but now it holds %d", name,
it->second.size()));
return this->GetDim(it->second[0]->MutableVar());
}
std::vector<DDim> GetInputsDim(const std::string& name) const override {
// const std::vector<Variable*>& vars = InputVars(name);
std::vector<DDim> vec_res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(it, tensor_in_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.emplace_back(GetDim(it->second[i]->MutableVar()));
} else {
vec_res.emplace_back();
}
}
return vec_res;
}
std::vector<paddle::framework::proto::VarType::Type> GetInputsVarType(
const std::string& name) const override {
std::vector<paddle::framework::proto::VarType::Type> vec_res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(
it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not find [%s] in input", name));
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.emplace_back(
paddle::framework::ToVarType(it->second[i]->MutableVar()->Type()));
} else {
vec_res.emplace_back();
}
}
return vec_res;
}
std::vector<paddle::framework::proto::VarType::Type> GetOutputsVarType(
const std::string& name) const override {
std::vector<paddle::framework::proto::VarType::Type> vec_res;
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.emplace_back(
paddle::framework::ToVarType(it->second[i]->MutableVar()->Type()));
} else {
vec_res.emplace_back(
static_cast<paddle::framework::proto::VarType::Type>(-1));
}
}
return vec_res;
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
if (it->second[0]) {
SetDim(it->second[0]->MutableVar(), dim);
}
}
void SetOutputsDim(const std::string& name,
const std::vector<DDim>& dims) override {
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
PADDLE_ENFORCE_EQ(dims.size(), it->second.size(),
paddle::platform::errors::InvalidArgument(
"The number of dims is expected to be equal to the "
"number of Outputs(%s). But receieved: the number of "
"dims = %d, the number of Outputs(%s) = %d.",
name, dims.size(), name, it->second.size()));
for (size_t i = 0; i < dims.size(); ++i) {
if (it->second[i]) {
SetDim(it->second[i]->MutableVar(), dims[i]);
}
}
}
int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"GetLoDLevel function not support in dygraph mode"));
}
void SetLoDLevel(const std::string& out, int32_t lod_level,
size_t j = 0) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"SetLoDLevel function not support in dygraph mode"));
}
protected:
DDim GetDim(paddle::framework::Variable* var) const {
PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::PreconditionNotMet(
"Input variable should not be null"));
if (var->IsType<paddle::framework::LoDTensor>()) {
return var->Get<paddle::framework::LoDTensor>().dims();
} else if (var->IsType<pten::SelectedRows>()) {
return var->Get<pten::SelectedRows>().GetCompleteDims();
} else {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is xx."));
}
}
std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"GetRepeatedDims not support in dygraph runtime"));
}
void SetDim(paddle::framework::Variable* var, const DDim& dim) {
if (var->IsType<paddle::framework::LoDTensor>()) {
var->GetMutable<paddle::framework::LoDTensor>()->Resize(dim);
} else if (var->IsType<pten::SelectedRows>()) {
var->GetMutable<pten::SelectedRows>()->set_height(dim[0]);
} else {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Variable type_id %s, expect LoDTensor/SelectedRows."));
}
}
void SetDims(const std::vector<paddle::framework::Variable*>& vars,
const std::vector<DDim>& dims) {
size_t length = vars.size();
PADDLE_ENFORCE_EQ(
length, dims.size(),
paddle::platform::errors::PreconditionNotMet(
"Vars number [%d] should be equal with dims number [%d]", length,
dims.size()));
for (size_t i = 0; i < length; ++i) {
if (vars[i] == nullptr) {
continue;
}
SetDim(vars[i], dims[i]);
}
}
void SetRepeatedDims(const std::string& name,
const std::vector<DDim>& dims) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"SetRepeatedDims not support in dygraph runtime"));
}
private:
const NameTensorMap* tensor_in_;
const NameTensorMap* tensor_out_;
const paddle::framework::AttributeMap* attrs_;
const paddle::framework::AttributeMap* default_attrs_;
const std::string op_type_;
const paddle::framework::OpKernelType* op_kernel_type_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/pten/api/all.h"
namespace egr {
namespace legacy {
// infer var type context for imperative mode
class TensorRuntimeInferVarTypeContext
: public paddle::framework::InferVarTypeContext {
public:
TensorRuntimeInferVarTypeContext(
const NameTensorMap& inputs, const NameTensorMap& outputs,
const paddle::framework::AttributeMap& attrs_map,
const paddle::framework::AttributeMap& default_attrs_map)
: InferVarTypeContext(nullptr, nullptr),
inputs_(inputs),
outputs_(outputs),
attrs_(attrs_map),
default_attrs_(default_attrs_map) {}
virtual ~TensorRuntimeInferVarTypeContext() {}
paddle::framework::Attribute GetAttr(const std::string& name) const override {
auto it = attrs_.find(name);
if (it == attrs_.end()) {
it = default_attrs_.find(name);
if (it == default_attrs_.end()) {
PADDLE_THROW(paddle::platform::errors::NotFound(
"Can not find [%s] in attributes.", name));
}
}
return it->second;
}
bool HasInput(const std::string& name) const override {
auto it = inputs_.find(name);
return (it != inputs_.end() && it->second.size() > 0);
}
bool HasOutput(const std::string& name) const override {
auto it = outputs_.find(name);
return (it != outputs_.end() && it->second.size() > 0);
}
size_t InputSize(const std::string& name) const {
return inputs_.at(name).size();
}
const std::string& InputVarName(const std::string& name,
const int index = 0) const {
// TODO(jiabin): Support this usage inputs_.at(name)[index]->Name()
auto it = inputs_.find(name);
PADDLE_ENFORCE_NE(it, inputs_.end(),
paddle::platform::errors::PreconditionNotMet(
"Can not find [%s] in Input", name));
return inputs_.at(name)[index]->name();
}
bool InputTypeAnyOf(
const std::string& name,
paddle::framework::proto::VarType::Type type) const override {
auto& inputs = inputs_.at(name);
return std::any_of(
inputs.begin(), inputs.end(),
[&type](const std::shared_ptr<egr::EagerTensor>& var) {
return paddle::framework::ToVarType(var->Var().Type()) == type;
});
}
bool InputTypeAllOf(
const std::string& name,
paddle::framework::proto::VarType::Type type) const override {
auto& inputs = inputs_.at(name);
return std::all_of(
inputs.begin(), inputs.end(),
[&type](const std::shared_ptr<egr::EagerTensor>& var) {
return paddle::framework::ToVarType(var->Var().Type()) == type;
});
}
void SyncTypeAndDataType(const std::string& input_name,
const std::string& output_name,
int index = 0) override {
auto in_tensor = inputs_.at(input_name)[index];
auto out_tensor = outputs_.at(output_name)[index];
if (in_tensor != out_tensor) {
this->SetTensorType(
out_tensor, paddle::framework::ToVarType(in_tensor->Var().Type()));
}
}
void SetOutputType(const std::string& name,
paddle::framework::proto::VarType::Type type,
int index = 0) override {
if (index == paddle::framework::ALL_ELEMENTS) {
for (auto& item : outputs_.at(name)) {
this->SetTensorType(item, type);
}
} else {
auto& var = outputs_.at(name)[index];
this->SetTensorType(var, type);
}
}
void SetTensorType(std::shared_ptr<egr::EagerTensor> out,
paddle::framework::proto::VarType::Type type) {
switch (type) {
case paddle::framework::proto::VarType::LOD_TENSOR: {
out->MutableVar()->GetMutable<paddle::framework::LoDTensor>();
break;
}
case paddle::framework::proto::VarType::SELECTED_ROWS: {
out->MutableVar()->GetMutable<pten::SelectedRows>();
break;
}
default: {
PADDLE_THROW(paddle::platform::errors::NotFound(
"Cannot found var type: %s while running runtime InferVarType",
paddle::framework::ToTypeName(type)));
}
}
}
paddle::framework::proto::VarType::Type GetInputType(
const std::string& name, const int& index = 0) const override {
return paddle::framework::ToVarType(inputs_.at(name)[index]->Var().Type());
}
paddle::framework::proto::VarType::Type GetOutputType(
const std::string& name, const int& index = 0) const override {
// TODO(jiabin): Support SelectedRows when we have it.
return paddle::framework::proto::VarType::LOD_TENSOR;
}
paddle::framework::proto::VarType::Type GetInputDataType(
const std::string& name, const int& index = 0) const override {
return inputs_.at(name)[index]
->Var()
.Get<paddle::framework::LoDTensor>()
.type();
}
void SetOutputDataType(const std::string& name,
paddle::framework::proto::VarType::Type type,
int index = 0) override {
// TODO(jiabin): It seems doesn't make sense to set data_type in EagerMode.
}
bool IsDygraph() const override { return true; }
protected:
bool HasVar(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"HasVar is not supported in runtime InferVarType"));
}
const std::vector<std::string>& InputVars(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"InputVars is not supported in runtime InferVarType"));
}
const std::vector<std::string>& OutputVars(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"OutputVars is not supported in runtime InferVarType"));
}
paddle::framework::proto::VarType::Type GetVarType(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
void SetVarType(const std::string& name,
paddle::framework::proto::VarType::Type type) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
paddle::framework::proto::VarType::Type GetVarDataType(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
void SetVarDataType(const std::string& name,
paddle::framework::proto::VarType::Type type) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
std::vector<paddle::framework::proto::VarType::Type> GetVarDataTypes(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"GetVarDataTypes is not supported in runtime InferVarType"));
}
void SetVarDataTypes(
const std::string& name,
const std::vector<paddle::framework::proto::VarType::Type>&
multiple_data_type) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"SetVarDataTypes is not supported in runtime InferVarType"));
}
std::vector<int64_t> GetVarShape(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle Shape in runtime InferVarType"));
}
void SetVarShape(const std::string& name,
const std::vector<int64_t>& dims) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle Shape in runtime InferVarType"));
}
int32_t GetVarLoDLevel(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle LoDLevel in runtime InferVarType"));
}
void SetVarLoDLevel(const std::string& name, int32_t lod_level) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle LoDLevel in runtime InferVarType"));
}
private:
const NameTensorMap& inputs_;
const NameTensorMap& outputs_;
const paddle::framework::AttributeMap& attrs_;
const paddle::framework::AttributeMap& default_attrs_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/op_runner.h"
#include <map>
#include <set>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/eager/legacy/amp_auto_cast.h"
#include "paddle/fluid/eager/legacy/infer_var_type_context.h"
#include "paddle/fluid/eager/legacy/prepared_operator.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/denormal.h"
#include "paddle/fluid/string/string_helper.h"
DECLARE_bool(use_mkldnn);
DECLARE_string(tracer_mkldnn_ops_on);
DECLARE_string(tracer_mkldnn_ops_off);
namespace egr {
namespace legacy {
void OpRunImpl(const paddle::framework::OperatorBase& op,
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs,
const paddle::platform::Place& place) {
VLOG(6) << "Get Opertor With Kernel";
auto* op_kernel =
dynamic_cast<const paddle::framework::OperatorWithKernel*>(&op);
PADDLE_ENFORCE_NOT_NULL(
op_kernel, paddle::platform::errors::PermissionDenied(
"Only support operator with kernel in Dygraph mode."));
auto& info = op.Info();
if (info.infer_var_type_) {
VLOG(6) << "Run InferVarType";
egr::legacy::TensorRuntimeInferVarTypeContext infer_var_type_ctx(
ins, outs, attrs, default_attrs);
VLOG(9) << "Actual Run InferVarType";
info.infer_var_type_(&infer_var_type_ctx);
}
VLOG(6) << "Initialize output tensor";
// Initialize output tensor
for (auto& tensor_pair : outs) {
for (auto& tensor : tensor_pair.second) {
if (tensor && tensor.get() && (!tensor->Var().IsInitialized())) {
InitializeVariable(tensor->MutableVar(),
paddle::framework::proto::VarType::LOD_TENSOR);
}
}
}
/**
* [ Why need temporary inputs here? ]
*
* PrepareData should not change the original input tensor in place.
* Suppose the user defines an int tensor and feeds it to an op whose
* overridden GetExpectedKernelForVar converts this tensor to float type
* during execution. After the dynamic graph has executed, the user-defined
* variable would be lost: the user could no longer get the originally
* defined int tensor, because it had been converted to float. This should
* be regarded as a bug in certain usage scenarios.
*
* In static graph mode, when an op is executed, a temporary scope
* `transfer_scope` is created before PrepareData; the transformed data is
* stored in that temporary scope and discarded after the op finishes. In
* the previous dynamic graph implementation, however, the original input
* was overwritten directly.
*/
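// A minimal sketch of the hazard described above (illustrative pseudo-code
// only; the tensor and op names here are hypothetical, not part of this file):
//
//   Tensor x = CreateIntTensor();                    // user-defined INT32 tensor
//   RunOp("op_that_casts_to_float", /*ins=*/{{"X", {x}}}, ...);
//   // expected: x is still INT32 afterwards; with an in-place overwrite
//   // in PrepareData, x would silently become FLOAT32.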
VLOG(6) << "Prepare Op";
auto prepared_op = egr::legacy::PreparedOp::Prepare(
ins, outs, *op_kernel, place, attrs, default_attrs);
VLOG(6) << "Prepare Data";
auto tmp_ins_ptr =
egr::legacy::PrepareData(*op_kernel, ins, prepared_op.kernel_type());
VLOG(6) << "Run Prepared Op";
if (tmp_ins_ptr == nullptr) {
prepared_op.Run(ins, outs, attrs, default_attrs);
} else {
prepared_op.Run(*tmp_ins_ptr, outs, attrs, default_attrs);
}
VLOG(6) << "Run Prepared Op end";
// TODO(jiabin): Set the output var's grad Forward DataType
}
void RunOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
const paddle::platform::Place& place,
paddle::framework::AttributeMap* default_attrs,
bool override_default_attr_map,
const std::map<std::string, std::string>& inplace_map) {
VLOG(1) << "Run Op: " << type;
if (FLAGS_use_mkldnn) {
// if both lists are empty, all ops are enabled (the default for
// FLAGS_use_mkldnn=1)
// if the ops_on list is not empty, only ops from that list are enabled
if (!FLAGS_tracer_mkldnn_ops_on.empty()) {
auto is_on = FLAGS_tracer_mkldnn_ops_on.find(type) != std::string::npos;
attrs["use_mkldnn"] = is_on;
} else {
// if the ops_on list is empty, all ops are enabled except the types in the off list
auto is_off = FLAGS_tracer_mkldnn_ops_off.find(type) != std::string::npos;
attrs["use_mkldnn"] = !is_off;
}
}
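// Worked example of the two lists above (the flag values are hypothetical):
//   FLAGS_tracer_mkldnn_ops_on  = "conv2d,relu"  -> only conv2d and relu get
//                                                   attrs["use_mkldnn"] = true
//   FLAGS_tracer_mkldnn_ops_on  = ""  together with
//   FLAGS_tracer_mkldnn_ops_off = "matmul_v2"    -> every op except matmul_v2
//                                                   gets attrs["use_mkldnn"] = true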
auto op = paddle::framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
PADDLE_ENFORCE_NOT_NULL(default_attrs,
paddle::platform::errors::PermissionDenied(
"Detected default_attrs = nullptr."));
if (override_default_attr_map) {
const auto& op_info = op->Info();
auto* attr_checker = op_info.Checker();
if (attr_checker) {
attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true);
}
static paddle::framework::AttributeMap empty_attrs_map = {};
*default_attrs = attr_checker == nullptr
? empty_attrs_map
: attr_checker->GetDefaultAttrMap();
}
auto amp_level = egr::Controller::Instance().GetAMPLevel();
VLOG(6) << "Check AMP status";
NameTensorMap new_ins = ins;
if (amp_level == paddle::imperative::AmpLevel::O1) {
VLOG(5) << "Auto mixed precision run operator: " << type;
new_ins = AutoCastInputs(type, ins);
} else if (amp_level == paddle::imperative::AmpLevel::O2) {
VLOG(5) << "Pure fp16 run operator: " << type;
new_ins = CastPureFp16Inputs(type, ins);
}
try {
VLOG(6) << "Get Device id";
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::SetDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
paddle::platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
}
VLOG(6) << "Step in OpRunImpl";
OpRunImpl(*op, new_ins, outs, attrs, *default_attrs, place);
} catch (paddle::platform::EnforceNotMet& exception) {
paddle::framework::AppendErrorOpHint(type, &exception);
throw std::move(exception);
} catch (std::exception& ex) {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Operator %s raises an %s exception.\n"
"The exception content is\n:%s.",
type, paddle::platform::demangle(typeid(ex).name()), ex.what()));
} catch (...) {
// NOTE: this branch represents a very serious bug with
// low probability of occurrence, and we can't get its
// exception content here.
PADDLE_THROW(paddle::platform::errors::Fatal(
"Operator %s raises an unknown exception.", type));
}
VLOG(6) << "Finish Run Op";
// TODO(jiabin): Support this later
// if (enable_program_desc_tracing_) {
// VLOG(5) << "Trace op " << type << " into ProgramDesc";
// program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
// }
}
} // namespace legacy
} // namespace egr
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/prepared_operator.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/eager/legacy/infer_shape_context.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/utils/small_vector.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
DECLARE_bool(check_nan_inf);
DECLARE_bool(run_pten_kernel);
namespace egr {
namespace legacy {
const paddle::framework::Tensor* GetTensorFromVar(
const paddle::framework::Variable& var) {
if (var.IsType<paddle::framework::LoDTensor>()) {
return &(var.Get<paddle::framework::LoDTensor>());
} else if (var.IsType<pten::SelectedRows>()) {
return &(var.Get<pten::SelectedRows>().value());
} else {
return nullptr;
}
}
static const paddle::framework::Attribute& GetAttr(
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs,
const std::string& name) {
auto it = attrs.find(name);
bool found = it != attrs.end();
if (!found) {
it = default_attrs.find(name);
found = it != default_attrs.end();
}
PADDLE_ENFORCE_EQ(found, true,
paddle::platform::errors::NotFound(
"(%s) is not found in AttributeMap.", name));
return it->second;
}
static void HandleComplexGradToRealGrad(const NameTensorMap& outs) {
// TODO(jiabin): Support complex forward datatype later.
}
PreparedOp::PreparedOp(
const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
paddle::platform::DeviceContext* dev_ctx)
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(func),
dev_ctx_(dev_ctx) {}
PreparedOp::PreparedOp(
const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::KernelSignature& kernel_signature,
const pten::Kernel& pt_kernel, paddle::platform::DeviceContext* dev_ctx)
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(nullptr),
dev_ctx_(dev_ctx),
run_pten_kernel_(true),
pt_kernel_signature_(kernel_signature),
pt_kernel_(pt_kernel) {}
PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::OperatorWithKernel& op,
const paddle::platform::Place& place,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
VLOG(6) << "Preparing an Op";
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
paddle::framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// The MKLDNN variant of the code reads attributes in some of the
// GetKernelTypeForVar and GetKernelType functions, so we need to copy the
// attributes there. The const qualifier of Attrs() had to be discarded to
// overwrite them.
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs =
const_cast<paddle::framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = default_attrs;
for (auto& attr : attrs) {
mutable_op_attrs[attr.first] = attr.second;
}
}
#endif
// 1. get expected kernel key
auto dygraph_exe_ctx = egr::legacy::EagerExecutionContext(
op, paddle::framework::Scope(), *dev_ctx, ctx, ins, outs, attrs,
default_attrs);
auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
// fit for pten
pten::KernelSignature pt_kernel_signature;
pten::KernelKey pt_kernel_key;
std::string pt_kernel_name;
if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx);
VLOG(6) << pt_kernel_signature;
pt_kernel_name = pt_kernel_signature.name;
pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key);
auto pt_kernel = pten::KernelFactory::Instance().SelectKernel(
pt_kernel_name, pt_kernel_key);
if (pt_kernel.IsValid()) {
VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
<< " | kernel key: " << pt_kernel_key
<< " | kernel: " << pt_kernel;
// TODO(chenweihang): using CPUKernel when miss device kernel case
return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
pt_kernel, dev_ctx);
} else {
VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name
<< "` not found.";
}
}
// 2. check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end() ||
kernels_iter->second.find(expected_kernel_key) ==
kernels_iter->second.end()
#ifdef PADDLE_WITH_XPU
||
paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
!paddle::platform::is_xpu_support_op(op.Type(),
expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type())
#endif
) {
if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
auto pt_cpu_kernel_key =
FallBackToCpu(expected_kernel_key, pt_kernel_key, op);
auto pt_cpu_kernel = pten::KernelFactory::Instance().SelectKernel(
pt_kernel_name, pt_cpu_kernel_key);
if (pt_cpu_kernel.IsValid()) {
VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
<< " | kernel key: " << pt_cpu_kernel_key
<< " | kernel: " << pt_cpu_kernel;
return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
pt_cpu_kernel, dev_ctx);
}
}
}
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
paddle::platform::errors::NotFound(
"There are no kernels which are registered in the %s operator.",
op.Type()));
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = paddle::platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = paddle::platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that
// case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
paddle::platform::errors::NotFound(
"Operator %s does not have kernel for %s.", op.Type(),
KernelTypeToString(expected_kernel_key)));
if (!(expected_kernel_key.place_ == place)) {
dev_ctx = pool.Get(expected_kernel_key.place_);
}
VLOG(6) << "Construct Prepared Op";
return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
}
PreparedOp PreparedOp::Prepare(
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::OperatorWithKernel& op,
const paddle::platform::Place& place,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
return PrepareImpl(ins, outs, op, place, attrs, default_attrs);
}
static void PreparedOpRunImpl(
const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
paddle::platform::DeviceContext* dev_ctx, const NameTensorMap& ins,
const NameTensorMap& outs, const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
// TODO(zjl): remove scope in dygraph
VLOG(6) << "Runing Prepared Op";
paddle::framework::Scope scope;
EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs,
op.Type(), &kernel_type);
op.Info().infer_shape_(&infer_shape_ctx);
func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs,
default_attrs));
if (FLAGS_check_nan_inf) {
paddle::framework::details::CheckOpHasNanOrInfInEager<EagerTensor>(
op.Type(), outs, dev_ctx->GetPlace());
}
/**
* [ Why need handle complex gradient to real gradient? ]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64), then the type of the grad
* tensor should be dout(complex64), dx(float32), dy (complex64).
*
* But because dout is complex64, dx is also complex64 after the grad op
* kernel has executed; we need to recognize this situation and convert
* dx back to float32. HandleComplexGradToRealGrad does this.
*/
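// Worked example of the promotion described above (illustrative only):
//   forward : x(float32) + y(complex64) -> out(complex64)
//   backward: dout(complex64) -> dy(complex64) and dx(complex64) come out of
//             the grad kernel; dx must then be converted back to float32 so
//             that it matches the dtype of x.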
if (paddle::framework::IsComplexType(kernel_type.data_type_)) {
HandleComplexGradToRealGrad(outs);
}
VLOG(6) << "Finish Runing Prepared Op";
}
static void PreparedOpRunPtImpl(
const paddle::framework::OperatorBase& op,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::KernelSignature& pt_kernel_signature,
const pten::Kernel& pt_kernel, paddle::platform::DeviceContext* dev_ctx,
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs,
op.Type());
static_cast<const paddle::framework::OperatorWithKernel&>(op).InferShape(
&infer_shape_ctx);
paddle::imperative::PreparePtenData<EagerTensor>(
pt_kernel, pt_kernel_signature,
static_cast<paddle::imperative::NameTensorMap>(ins));
pten::KernelContext pt_kernel_context;
paddle::imperative::BuildDygraphPtenKernelContext<EagerTensor>(
pt_kernel_signature, pt_kernel,
static_cast<paddle::imperative::NameTensorMap>(ins),
static_cast<paddle::imperative::NameTensorMap>(outs), attrs,
default_attrs, dev_ctx, &pt_kernel_context);
pt_kernel(&pt_kernel_context);
// TODO(chenweihang): add debug flags later
// TODO(chenweihang): deal with complex cases later
}
void PreparedOp::Run(const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
if (run_pten_kernel_) {
PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_,
dev_ctx_, ins, outs, attrs, default_attrs);
} else {
PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs,
attrs, default_attrs);
}
}
std::shared_ptr<NameTensorMap> PrepareData(
const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins,
const paddle::framework::OpKernelType& expected_kernel_key) {
std::shared_ptr<NameTensorMap> tmp_ins_ptr = nullptr;
for (const auto& name_pair : ins) {
for (size_t i = 0; i < name_pair.second.size(); ++i) {
auto& egr_tensor = name_pair.second[i];
const auto* tensor = GetTensorFromVar(egr_tensor->Var());
if (tensor && tensor->IsInitialized()) {
auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
} else {
// TODO(jiabin): Support Cache later
VLOG(3) << "Transform Variable " << egr_tensor->name() << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
paddle::framework::Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
&out);
if (NeedTransformDataType(kernel_type_for_var, expected_kernel_key)) {
// To avoid NameVarMap copy construction overhead in general
// scenarios, if inplace transformed, return original input
// directly
if (tmp_ins_ptr == nullptr) {
tmp_ins_ptr = std::make_shared<NameTensorMap>(ins);
}
auto tmp_egr_tensor =
std::make_shared<EagerTensor>(egr_tensor->name());
SetTensorToVariable(egr_tensor->Var(), out,
tmp_egr_tensor->MutableVar());
(*tmp_ins_ptr)[name_pair.first][i] = tmp_egr_tensor;
} else {
// if the dtype is the same, an in-place transform will not change the
// original value, so transform in place to avoid an extra copy
SetTensorToVariable(egr_tensor->Var(), out,
egr_tensor->MutableVar());
}
}
}
}
}
return tmp_ins_ptr;
}
} // namespace legacy
} // namespace egr
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/eager/legacy/execution_context.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/type_defs.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace egr {
namespace legacy {
const paddle::framework::Tensor* GetTensorFromVar(
const paddle::framework::Variable& var);
std::shared_ptr<NameTensorMap> PrepareData(
const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins,
const paddle::framework::OpKernelType& expected_kernel_key);
class PreparedOp {
public:
PreparedOp(const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
paddle::platform::DeviceContext* dev_ctx);
PreparedOp(const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::KernelSignature& kernel_signature,
const pten::Kernel& pt_kernel,
paddle::platform::DeviceContext* dev_ctx);
static PreparedOp Prepare(
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::OperatorWithKernel& op,
const paddle::platform::Place& place,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs);
void Run(const NameTensorMap& in, const NameTensorMap& out,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs);
const paddle::framework::OpKernelType& kernel_type() const {
return kernel_type_;
}
private:
const paddle::framework::OperatorBase& op_;
const paddle::framework::RuntimeContext& ctx_;
paddle::framework::OpKernelType kernel_type_;
paddle::framework::OperatorWithKernel::OpKernelFunc func_;
paddle::platform::DeviceContext* dev_ctx_;
// NOTE(chenweihang): Similar op members are used to adapt to the
// new pten kernel; if there is a better design in the future,
// we may polish the implementation here
bool run_pten_kernel_{false};
paddle::framework::KernelSignature pt_kernel_signature_;
pten::Kernel pt_kernel_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/place.h"
namespace egr {
namespace legacy {
void InitializeVariable(paddle::framework::Variable *var,
paddle::framework::proto::VarType::Type var_type) {
if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<paddle::framework::LoDTensor>();
} else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) {
var->GetMutable<pten::SelectedRows>();
} else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) {
var->GetMutable<paddle::framework::FeedList>();
} else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) {
var->GetMutable<paddle::framework::FetchList>();
} else if (var_type == paddle::framework::proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<paddle::framework::Scope *>>();
} else if (var_type == paddle::framework::proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<paddle::framework::LoDRankTable>();
} else if (var_type == paddle::framework::proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<paddle::framework::LoDTensorArray>();
} else if (var_type == paddle::framework::proto::VarType::STRINGS) {
var->GetMutable<paddle::framework::Strings>();
} else if (var_type == paddle::framework::proto::VarType::VOCAB) {
var->GetMutable<paddle::framework::Vocab>();
} else if (var_type == paddle::framework::proto::VarType::PLACE_LIST) {
var->GetMutable<paddle::platform::PlaceList>();
} else if (var_type == paddle::framework::proto::VarType::READER) {
var->GetMutable<paddle::framework::ReaderHolder>();
} else if (var_type == paddle::framework::proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(paddle::platform::errors::Unavailable(
"paddle::framework::Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
var_type));
}
}
void CopyVariable(const paddle::framework::Variable &src_var,
paddle::framework::Variable *dst_var) {
// only support cpu now
auto cpu_place = paddle::platform::CPUPlace();
if (src_var.IsType<paddle::framework::LoDTensor>()) {
auto *tmp_grad_tensor = dst_var->GetMutable<paddle::framework::LoDTensor>();
auto &src_tensor = src_var.Get<paddle::framework::LoDTensor>();
tmp_grad_tensor->set_lod(src_tensor.lod());
paddle::framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor);
} else if (src_var.IsType<pten::SelectedRows>()) {
auto &src_slr = src_var.Get<pten::SelectedRows>();
auto *tmp_grad_slr = dst_var->GetMutable<pten::SelectedRows>();
tmp_grad_slr->set_rows(src_slr.rows());
tmp_grad_slr->set_height(src_slr.height());
auto &src_t = src_slr.value();
auto *dst_t = tmp_grad_slr->mutable_value();
paddle::framework::TensorCopy(src_t, cpu_place, dst_t);
} else {
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Unknown variable type to copy."));
}
}
paddle::framework::proto::VarType::Type GetDtypeFromVar(
const paddle::framework::Variable &var) {
if (var.IsType<paddle::framework::LoDTensor>()) {
return var.Get<paddle::framework::LoDTensor>().type();
} else if (var.IsType<pten::SelectedRows>()) {
return var.Get<pten::SelectedRows>().value().type();
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Variable type is %s, expect LoDTensor or SelectedRows.",
paddle::framework::ToTypeName(var.Type())));
}
}
const paddle::platform::Place &GetPlaceFromVar(
const paddle::framework::Variable &var) {
if (var.IsType<paddle::framework::LoDTensor>()) {
return var.Get<paddle::framework::LoDTensor>().place();
} else if (var.IsType<pten::SelectedRows>()) {
return var.Get<pten::SelectedRows>().place();
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Variable type is %s, expect LoDTensor or SelectedRows.",
paddle::framework::ToTypeName(var.Type())));
}
}
} // namespace legacy
} // namespace egr
......@@ -36,11 +36,6 @@
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
// Disable pten path
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
TEST(Benchmark, EagerScaleCPU) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
......
......@@ -35,10 +35,6 @@
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(Benchmark, EagerScaleCUDA) {
......
......@@ -34,11 +34,6 @@
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
namespace paddle {
namespace imperative {
......
......@@ -34,11 +34,6 @@
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
namespace paddle {
......
......@@ -214,7 +214,7 @@ void benchmark_fluid_scale(const std::shared_ptr<imperative::VarBase>& X,
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("scale", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("scale", ins, outs, attrs, place, true);
tmp_out = outs["Out"][0];
}
......@@ -250,7 +250,7 @@ void benchmark_fluid_matmul(const std::shared_ptr<imperative::VarBase>& X,
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("matmul_v2", ins, outs, attrs, place, true);
tmp_out = outs["Out"][0];
}
......@@ -288,7 +288,7 @@ void benchmark_fluid_mlp(
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("matmul_v2", ins, outs, attrs, place, true);
// EW-Add0
ins = {{"X", outs["Out"]}, {"Y", {Bs[i]}}};
......@@ -296,7 +296,7 @@ void benchmark_fluid_mlp(
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("elementwise_add", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("elementwise_add", ins, outs, attrs, place, true);
input0 = outs["Out"][0];
}
......@@ -307,7 +307,7 @@ void benchmark_fluid_mlp(
new imperative::VarBase(true, "Out"))}}};
attrs = {{"reduce_all", true}};
tracer.TraceOp("reduce_sum", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("reduce_sum", ins, outs, attrs, place, true);
auto* engine = tracer.GetEngine();
std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
......
......@@ -286,4 +286,43 @@ void EagerUtils::CheckAndRetainGrad(
}
}
paddle::experimental::Tensor EagerUtils::SyncToPtenTensors(
const egr::EagerTensor& tensor) {
const_cast<EagerTensor*>(&tensor)->SyncToTensor();
return *tensor.Tensor().get();
}
std::vector<paddle::experimental::Tensor> EagerUtils::SyncToPtenTensors(
const std::vector<egr::EagerTensor>& tensors) {
std::vector<paddle::experimental::Tensor> res;
size_t num = tensors.size();
res.reserve(num);
for (size_t i = 0; i < num; i++) {
const_cast<EagerTensor*>(&(tensors[i]))->SyncToTensor();
res.push_back(*tensors[i].Tensor().get());
}
return res;
}
egr::EagerTensor EagerUtils::CreateEagerTensorFromTensor(
const paddle::experimental::Tensor& tensor) {
egr::EagerTensor ret;
ret.set_tensor(std::make_shared<paddle::experimental::Tensor>(tensor));
return ret;
}
std::vector<egr::EagerTensor> EagerUtils::CreateEagerTensorFromTensor(
const std::vector<paddle::experimental::Tensor>& tensors) {
std::vector<egr::EagerTensor> res;
size_t num = tensors.size();
res.reserve(num);
for (size_t i = 0; i < num; i++) {
egr::EagerTensor tmp;
tmp.set_tensor(std::make_shared<paddle::experimental::Tensor>(tensors[i]));
res.emplace_back(std::move(tmp));
}
return res;
}
} // namespace egr
......@@ -170,6 +170,16 @@ class EagerUtils {
static void CheckAndRetainGrad(const egr::EagerTensor& tensor);
static void CheckAndRetainGrad(const std::vector<egr::EagerTensor>& tensors);
static paddle::experimental::Tensor SyncToPtenTensors(
const egr::EagerTensor& tensor);
static std::vector<paddle::experimental::Tensor> SyncToPtenTensors(
const std::vector<egr::EagerTensor>& tensors);
static egr::EagerTensor CreateEagerTensorFromTensor(
const paddle::experimental::Tensor& tensor);
static std::vector<egr::EagerTensor> CreateEagerTensorFromTensor(
const std::vector<paddle::experimental::Tensor>& tensors);
};
} // namespace egr
......@@ -293,7 +293,7 @@ if(WITH_DISTRIBUTE)
ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto trainer_desc_proto glog fs shell
fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer
lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS}
graph_to_program_pass variable_helper data_feed_proto timer monitor
heter_service_proto fleet_executor ${BRPC_DEP})
......@@ -315,7 +315,7 @@ if(WITH_DISTRIBUTE)
pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
index_sampler index_wrapper sampler index_dataset_proto
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
......@@ -336,7 +336,7 @@ if(WITH_DISTRIBUTE)
ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor fleet_executor)
endif()
elseif(WITH_PSLIB)
......
......@@ -25,12 +25,10 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_registry.h"
DECLARE_bool(run_pten_kernel);
namespace paddle {
namespace framework {
......@@ -279,10 +277,6 @@ static void RunKernelFunc(pten::KernelContext* ctx,
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos) {
PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true,
platform::errors::Unimplemented(
"Custom Kernel depends on pten kernel enabled,"));
for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
auto& kernel_info = op_kernel_infos[i];
auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
......
......@@ -212,11 +212,13 @@ TEST(CustomKernel, custom_kernel_dot) {
kernel_context.EmplaceBackAttr(fake_attr_int64_vec);
kernel_context.EmplaceBackAttr(fake_attr_int_vec);
auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta());
auto dense_out = std::make_shared<pten::DenseTensor>(
pten::make_intrusive<paddle::experimental::SharedStorage>(
pten::TransToFluidPlace(backend)),
std::move(out_meta));
pten::DenseTensorMeta());
pten::MetaTensor meta_out(dense_out.get());
pten::DotInferMeta(*dense_x, *dense_y, &meta_out);
kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1)
// fake_input_vec: idx:1, index:[1,3)
......
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "paddle/pten/api/lib/api_declare.h"
#include "paddle/pten/api/lib/ext_compat_utils.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/utils/any.h"
namespace paddle {
......@@ -110,8 +110,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
VLOG(3) << "Custom Operator: Start run KernelFunc.";
std::vector<paddle::experimental::Tensor> custom_ins;
std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
// prepare CustomOpKernelContext
paddle::CustomOpKernelContext kernel_ctx;
for (auto& in_name : inputs) {
VLOG(3) << "Custom Operator: input name - " << in_name;
if (detail::IsDuplicableVar(in_name)) {
......@@ -136,7 +136,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
custom_t.set_impl(std::make_shared<pten::DenseTensor>(*x));
custom_vec_in.emplace_back(custom_t);
}
custom_vec_ins.emplace_back(custom_vec_in);
kernel_ctx.EmplaceBackInputs(std::move(custom_vec_in));
} else {
auto* x = ctx.Input<Tensor>(in_name);
PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound(
......@@ -146,33 +146,32 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
"Input tensor (%s) is not initialized.", in_name));
paddle::experimental::Tensor custom_in;
custom_in.set_impl(std::make_shared<pten::DenseTensor>(*x));
custom_ins.emplace_back(custom_in);
kernel_ctx.EmplaceBackInput(std::move(custom_in));
}
}
std::vector<paddle::any> custom_attrs;
for (auto& attr_str : attrs) {
auto attr_name_and_type = detail::ParseAttrStr(attr_str);
auto attr_name = attr_name_and_type[0];
auto attr_type_str = attr_name_and_type[1];
if (attr_type_str == "bool") {
custom_attrs.emplace_back(ctx.Attr<bool>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<bool>(attr_name));
} else if (attr_type_str == "int") {
custom_attrs.emplace_back(ctx.Attr<int>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<int>(attr_name));
} else if (attr_type_str == "float") {
custom_attrs.emplace_back(ctx.Attr<float>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<float>(attr_name));
} else if (attr_type_str == "int64_t") {
custom_attrs.emplace_back(ctx.Attr<int64_t>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<int64_t>(attr_name));
} else if (attr_type_str == "std::string") {
custom_attrs.emplace_back(ctx.Attr<std::string>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::string>(attr_name));
} else if (attr_type_str == "std::vector<int>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<int>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<int>>(attr_name));
} else if (attr_type_str == "std::vector<float>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<float>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<float>>(attr_name));
} else if (attr_type_str == "std::vector<int64_t>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<int64_t>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<int64_t>>(attr_name));
} else if (attr_type_str == "std::vector<std::string>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<std::string>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<std::string>>(attr_name));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported `%s` type value as custom attribute now. "
......@@ -185,39 +184,75 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
}
}
VLOG(3) << "Custom Operator: Run ComputeFunc.";
try {
auto outs = func(custom_ins, custom_vec_ins, custom_attrs);
VLOG(3) << "Custom Operator: push outputs into CustomOpKernelContext.";
// cache the target tensor pointers
std::vector<Tensor*> true_out_ptrs;
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL,
platform::errors::PreconditionNotMet(
"If custom operator's outputs contains `paddle::Vec("
")` type, "
"it only can hold one output."));
auto vec_out = ctx.MultiOutput<Tensor>(out_name);
PADDLE_ENFORCE_NE(vec_out.empty(), true,
platform::errors::NotFound(
"Output vector<tensor> (%s) is empty.", out_name));
std::vector<paddle::experimental::Tensor> custom_vec_out;
for (size_t j = 0; j < vec_out.size(); ++j) {
auto* out = vec_out[j];
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"The %d-th tensor in output vector<tensor> (%s) is nullptr.", j,
out_name));
true_out_ptrs.emplace_back(out);
paddle::experimental::Tensor custom_t;
// here only can copy the output tensor into context
custom_t.set_impl(std::make_shared<pten::DenseTensor>(*out));
custom_vec_out.emplace_back(custom_t);
}
kernel_ctx.EmplaceBackOutputs(std::move(custom_vec_out));
} else {
auto* out = ctx.Output<Tensor>(out_name);
PADDLE_ENFORCE_NOT_NULL(
out, platform::errors::NotFound("Output tensor (%s) is nullptr.",
out_name));
true_out_ptrs.emplace_back(out);
paddle::experimental::Tensor custom_out;
// here only can copy the output tensor into context
custom_out.set_impl(std::make_shared<pten::DenseTensor>(*out));
kernel_ctx.EmplaceBackOutput(std::move(custom_out));
}
}
VLOG(3) << "Custom Operator: Share outputs into ExecutionContext.";
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL,
platform::errors::PreconditionNotMet(
"If custom operator's outputs contains `paddle::Vec("
")` type, "
"it only can hold one output."));
auto vec_true_outs = ctx.MultiOutput<Tensor>(out_name);
PADDLE_ENFORCE_EQ(
vec_true_outs.size(), outs.size(),
platform::errors::InvalidArgument(
"The number of element in custom operator outputs is wrong, "
"expected contains %d Tensors, but actually contains %d "
"Tensors.",
vec_true_outs.size(), outs.size()));
for (size_t j = 0; j < vec_true_outs.size(); ++j) {
experimental::SharesStorage(
std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(j).impl())
.get(),
vec_true_outs.at(j));
}
} else {
auto* true_out = ctx.Output<Tensor>(out_name);
experimental::SharesStorage(
std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(i).impl())
.get(),
true_out);
try {
VLOG(3) << "Custom Operator: Run ComputeFunc.";
func(&kernel_ctx);
// sync output tensor data into original output
auto* calc_outs = kernel_ctx.AllMutableOutput();
PADDLE_ENFORCE_EQ(
true_out_ptrs.size(), calc_outs->size(),
platform::errors::InvalidArgument(
"The number of element in custom operator outputs is wrong, "
"expected contains %d Tensors, but actually contains %d "
"Tensors.",
true_out_ptrs.size(), calc_outs->size()));
for (size_t i = 0; i < true_out_ptrs.size(); ++i) {
auto* true_out = true_out_ptrs.at(i);
auto calc_out =
std::dynamic_pointer_cast<pten::DenseTensor>(calc_outs->at(i).impl());
// assign meta info
auto* true_out_meta = pten::DenseTensorUtils::GetMutableMeta(true_out);
true_out_meta->dims = calc_out->dims();
true_out_meta->dtype = calc_out->dtype();
true_out_meta->layout = calc_out->layout();
// lod and offset no need to be reset
// reset holder if needed
if (true_out->Holder() != calc_out->Holder()) {
true_out->ResetHolder(calc_out->Holder());
}
}
} catch (platform::EnforceNotMet& exception) {
......@@ -613,7 +648,7 @@ void RegisterOperatorWithMetaInfo(
auto op_name = OpMetaInfoHelper::GetOpName(base_op_meta);
if (OpInfoMap::Instance().Has(op_name)) {
LOG(WARNING) << "Operator (" << op_name << ")has been registered.";
LOG(WARNING) << "Operator (" << op_name << ") has been registered.";
return;
}
......
......@@ -340,6 +340,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
this->thread_id_ = 0;
this->thread_num_ = 1;
this->parse_ins_id_ = false;
this->parse_uid_ = false;
this->parse_content_ = false;
this->parse_logkey_ = false;
this->enable_pv_merge_ = false;
......@@ -498,6 +499,11 @@ void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id;
}
template <typename T>
void InMemoryDataFeed<T>::SetParseUid(bool parse_uid) {
parse_uid_ = parse_uid;
}
template <typename T>
void InMemoryDataFeed<T>::LoadIntoMemory() {
#ifdef _LINUX
......@@ -1047,6 +1053,7 @@ void MultiSlotInMemoryDataFeed::Init(
use_slots_shape_.push_back(local_shape);
}
}
uid_slot_ = multi_slot_desc.uid_slot();
feed_vec_.resize(use_slots_.size());
const int kEstimatedFeasignNumPerSlot = 5; // Magic Number
for (size_t i = 0; i < all_slot_num; i++) {
......@@ -1160,6 +1167,19 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
"\nWe detect the feasign number of this slot is %d, "
"which is illegal.",
str, i, num));
#ifdef PADDLE_WITH_PSLIB
if (parse_uid_ && all_slots_[i] == uid_slot_) {
PADDLE_ENFORCE(num == 1 && all_slots_type_[i][0] == 'u',
platform::errors::PreconditionNotMet(
"The uid has to be uint64 and single.\n"
"please check this error line: %s",
str));
char* uidptr = endptr;
uint64_t feasign = (uint64_t)strtoull(uidptr, &uidptr, 10);
instance->uid_ = feasign;
}
#endif
if (idx != -1) {
if (all_slots_type_[i][0] == 'f') { // float
for (int j = 0; j < num; ++j) {
......
......@@ -191,6 +191,7 @@ struct Record {
uint64_t search_id;
uint32_t rank;
uint32_t cmatch;
std::string uid_;
};
inline SlotRecord make_slotrecord() {
......@@ -562,6 +563,7 @@ class DataFeed {
virtual void SetThreadNum(int thread_num) {}
// This function will do nothing at default
virtual void SetParseInsId(bool parse_ins_id) {}
virtual void SetParseUid(bool parse_uid) {}
virtual void SetParseContent(bool parse_content) {}
virtual void SetParseLogKey(bool parse_logkey) {}
virtual void SetEnablePvMerge(bool enable_pv_merge) {}
......@@ -645,6 +647,7 @@ class DataFeed {
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
std::string uid_slot_;
// The input type of pipe reader, 0 for one sample, 1 for one batch
int input_type_;
......@@ -709,6 +712,7 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetThreadId(int thread_id);
virtual void SetThreadNum(int thread_num);
virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseUid(bool parse_uid);
virtual void SetParseContent(bool parse_content);
virtual void SetParseLogKey(bool parse_logkey);
virtual void SetEnablePvMerge(bool enable_pv_merge);
......@@ -737,6 +741,7 @@ class InMemoryDataFeed : public DataFeed {
int thread_id_;
int thread_num_;
bool parse_ins_id_;
bool parse_uid_;
bool parse_content_;
bool parse_logkey_;
bool enable_pv_merge_;
......
......@@ -22,7 +22,10 @@ message Slot {
repeated int32 shape = 5; // we can define N-D Tensor
}
message MultiSlotDesc { repeated Slot slots = 1; }
message MultiSlotDesc {
repeated Slot slots = 1;
optional string uid_slot = 2;
}
message DataFeedDesc {
optional string name = 1;
......
......@@ -57,6 +57,8 @@ DatasetImpl<T>::DatasetImpl() {
parse_logkey_ = false;
preload_thread_num_ = 0;
global_index_ = 0;
shuffle_by_uid_ = false;
parse_uid_ = false;
}
// set filelist, file_idx_ will reset to zero.
......@@ -150,6 +152,12 @@ void DatasetImpl<T>::SetMergeBySid(bool is_merge) {
merge_by_sid_ = is_merge;
}
template <typename T>
void DatasetImpl<T>::SetShuffleByUid(bool enable_shuffle_uid) {
shuffle_by_uid_ = enable_shuffle_uid;
parse_uid_ = true;
}
template <typename T>
void DatasetImpl<T>::SetEnablePvMerge(bool enable_pv_merge) {
enable_pv_merge_ = enable_pv_merge;
......@@ -664,11 +672,14 @@ void MultiSlotDataset::GlobalShuffle(int thread_num) {
<< input_channel_->Size();
auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t {
if (!this->merge_by_insid_) {
return fleet_ptr->LocalRandomEngine()() % this->trainer_num_;
} else {
if (this->merge_by_insid_) {
return XXH64(data.ins_id_.data(), data.ins_id_.length(), 0) %
this->trainer_num_;
} else if (this->shuffle_by_uid_) {
return XXH64(data.uid_.data(), data.uid_.length(), 0) %
this->trainer_num_;
} else {
return fleet_ptr->LocalRandomEngine()() % this->trainer_num_;
}
};
......@@ -902,6 +913,7 @@ void DatasetImpl<T>::CreateReaders() {
readers_[i]->SetFeaNum(&total_fea_num_);
readers_[i]->SetFileList(filelist_);
readers_[i]->SetParseInsId(parse_ins_id_);
readers_[i]->SetParseUid(parse_uid_);
readers_[i]->SetParseContent(parse_content_);
readers_[i]->SetParseLogKey(parse_logkey_);
readers_[i]->SetEnablePvMerge(enable_pv_merge_);
......@@ -972,6 +984,7 @@ void DatasetImpl<T>::CreatePreLoadReaders() {
preload_readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_);
preload_readers_[i]->SetFeaNum(&total_fea_num_);
preload_readers_[i]->SetParseInsId(parse_ins_id_);
preload_readers_[i]->SetParseUid(parse_uid_);
preload_readers_[i]->SetParseContent(parse_content_);
preload_readers_[i]->SetParseLogKey(parse_logkey_);
preload_readers_[i]->SetEnablePvMerge(enable_pv_merge_);
......
......@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
......